1 /*
    2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Constant data definitions
   83 
   84 static const uint32_t _sha256_round_consts[64] = {
        // The 64 32-bit SHA-256 round constants K[0..63] (FIPS 180-4), listed
        // in round order, four per row.
   85   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
   86   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
   87   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
   88   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
   89   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
   90   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
   91   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
   92   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
   93   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
   94   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
   95   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
   96   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
   97   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
   98   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
   99   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  100   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  101 };
  102 
  103 static const uint64_t _sha512_round_consts[80] = {
        // The 80 64-bit SHA-512 round constants K[0..79] (FIPS 180-4), listed
        // in round order, three per row (two on the final row).
  104   0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
  105   0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
  106   0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
  107   0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
  108   0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
  109   0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
  110   0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
  111   0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
  112   0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
  113   0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
  114   0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
  115   0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
  116   0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
  117   0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
  118   0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
  119   0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
  120   0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
  121   0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
  122   0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
  123   0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
  124   0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
  125   0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
  126   0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
  127   0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
  128   0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
  129   0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
  130   0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
  131 };
  132 
  133 static const uint64_t _sha3_round_consts[24] = {
        // The 24 64-bit Keccak-f[1600] round constants (one per round, in round
        // order) as specified for SHA-3 in FIPS 202.
  134   0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  135   0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  136   0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  137   0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  138   0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  139   0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  140   0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  141   0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
  142 };
  143 
  144 static const uint64_t _double_keccak_round_consts[24] = {
        // The 24 Keccak-f[1600] round constants in round order. The values are
        // element-for-element identical to _sha3_round_consts.
        // NOTE(review): the consumer of this table is not visible here; confirm
        // whether a separate copy is required or the SHA-3 table could be shared.
  145   0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  146   0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  147   0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  148   0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  149   0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  150   0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  151   0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  152   0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
  153 };
  154 
  155 // Omit the 3rd limb of the modulus since it is 0.
      // NOTE(review): the array is declared with 5 elements but has only 4
      // initializers, so element [4] is implicitly zero-initialized. Confirm
      // against the consumer whether the extra element is intentional (e.g.
      // padding for a wider load) or the bound should be 4.
  156 static const int64_t _modulus_P256[5] = {
  157   0x000fffffffffffffL, 0x00000fffffffffffL,
  158   0x0000001000000000L, 0x0000ffffffff0000L
  159 };
  160 
  161 static const char _encodeBlock_toBase64[64] = {
        // Encode alphabet indexed by 6-bit value (0..63): 'A'-'Z', 'a'-'z',
        // '0'-'9', then '+' (62) and '/' (63) -- the standard Base64 alphabet
        // of RFC 4648. The array is not NUL-terminated.
  162   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  163   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  164   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  165   'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  166   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
  167 };
  168 
  169 static const char _encodeBlock_toBase64URL[64] = {
        // Encode alphabet indexed by 6-bit value (0..63): 'A'-'Z', 'a'-'z',
        // '0'-'9', then '-' (62) and '_' (63) -- the URL- and filename-safe
        // Base64 alphabet of RFC 4648. It differs from _encodeBlock_toBase64
        // only in the last two entries. The array is not NUL-terminated.
  170   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  171   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  172   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  173   'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  174   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
  175 };
  176 
  177 // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
  178 // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
  179 // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
  180 static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
        // Indexed by input byte value (0..255), 16 entries per row. A valid
        // standard-alphabet character maps to its 6-bit value; every other
        // byte, including '=' (61), maps to 255. '+' (43) -> 62, '/' (47) -> 63.
  181   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  182   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  183   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  184   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  185   255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  186   15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
  187   255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  188   41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  189   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  190   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  191   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  192   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  193   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  194   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  195   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  196   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  197 };
  198 
  199 static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
        // Indexed by input byte value (0..255), 16 entries per row. A valid
        // URL-safe-alphabet character maps to its 6-bit value; every other
        // byte, including '=' (61), maps to 255. '-' (45) -> 62, '_' (95) -> 63;
        // '+' and '/' are invalid in this variant.
  200   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  201   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  202   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  203   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  204   255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  205   15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
  206   255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  207   41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  208   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  209   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  210   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  211   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  212   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  213   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  214   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  215   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  216 };
  217 
  218 // A legal base64 code is in the range [0, 127].  We need two lookups
  219 // with tbl/tbx and combine them to get the decoded data. The 1st table vector
  220 // lookup uses tbl, so out-of-range indices are set to 0 in the destination. The 2nd
  221 // table vector lookup uses tbx, so out-of-range indices leave the destination
  222 // unchanged. Input [64..126] is mapped to index [65, 127] in the second lookup.
  223 // The value at index 64 is set to 0, so that we know that we already got the
  224 // decoded data with the 1st lookup.
  225 static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
        // Standard-alphabet decode table for the two-step tbl/tbx lookup
        // described in the comment preceding this table; 16 entries per row.
        //   [0..63]   : decode values for input bytes 0..63 (255 = illegal).
        //   [64]      : 0, the "already decoded by the 1st lookup" marker.
        //   [65..127] : decode values for input bytes 64..126, shifted up by
        //               one slot (e.g. 'A' (65) is at index 66 and maps to 0).
  226   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  227   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  228   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  229   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  230   0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  231   14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  232   255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  233   40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
  234 };
  235 
  236 static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
        // URL-safe-alphabet decode table for the two-step tbl/tbx lookup; same
        // layout as _decodeBlock_fromBase64ForSIMD (16 entries per row, index 64
        // is the 0 marker, input bytes 64..126 live at indices 65..127).
        // Differences from the standard table: '-' (45) -> 62 instead of '+',
        // '/' is illegal, and '_' (95) -> 63 is stored at index 96.
  237   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  238   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  239   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  240   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  241   0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  242   14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  243   63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  244   40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
  245 };
  246 
  247 
  248 // Stub Code definitions
  249 
  250 class StubGenerator: public StubCodeGenerator {
  251  private:
  252 
  253 #ifdef PRODUCT
      // PRODUCT builds: inc_counter_np(counter) expands to a no-op expression,
      // so no counter-update code is emitted.
  254 #define inc_counter_np(counter) ((void)0)
  255 #else
        // Non-PRODUCT builds: emit an incrementw on the memory location of
        // 'counter', addressed as an ExternalAddress.
  256   void inc_counter_np_(uint& counter) {
  257     __ incrementw(ExternalAddress((address)&counter));
  258   }
      // Wrapper that first emits a block comment naming the counter and then
      // the increment itself.
      // NOTE(review): this expands to two statements plus a trailing ';', so
      // it is not safe as the sole body of an unbraced if/else.
  259 #define inc_counter_np(counter) \
  260   BLOCK_COMMENT("inc_counter " #counter); \
  261   inc_counter_np_(counter);
  262 #endif
  263 
  264   // Call stubs are used to call Java from C
  265   //
  266   // Arguments:
  267   //    c_rarg0:   call wrapper address                   address
  268   //    c_rarg1:   result                                 address
  269   //    c_rarg2:   result type                            BasicType
  270   //    c_rarg3:   method                                 Method*
  271   //    c_rarg4:   (interpreter) entry point              address
  272   //    c_rarg5:   parameters                             intptr_t*
  273   //    c_rarg6:   parameter size (in words)              int
  274   //    c_rarg7:   thread                                 Thread*
  275   //
  276   // There is no return from the stub itself as any Java result
  277   // is written to result
  278   //
  279   // we save r30 (lr) as the return PC at the base of the frame and
  280   // link r29 (fp) below it as the frame pointer installing sp (r31)
  281   // into fp.
  282   //
  283   // we save r0-r7, which accounts for all the c arguments.
  284   //
  285   // TODO: strictly do we need to save them all? they are treated as
  286   // volatile by C so could we omit saving the ones we are going to
  287   // place in global registers (thread? method?) or those we only use
  288   // during setup of the Java call?
  289   //
  290   // we don't need to save r8 which C uses as an indirect result location
  291   // return register.
  292   //
  293   // we don't need to save r9-r15 which both C and Java treat as
  294   // volatile
  295   //
  296   // we don't need to save r16-18 because Java does not use them
  297   //
  298   // we save r19-r28 which Java uses as scratch registers and C
  299   // expects to be callee-save
  300   //
  301   // we save the bottom 64 bits of each value stored in v8-v15; it is
  302   // the responsibility of the caller to preserve larger values.
  303   //
  304   // so the stub frame looks like this when we enter Java code
  305   //
  306   //     [ return_from_Java     ] <--- sp
  307   //     [ argument word n      ]
  308   //      ...
  309   // -29 [ argument word 1      ]
  310   // -28 [ saved Floating-point Control Register ] <--- sp_after_call (slot -27 is unused)
  311   // -26 [ saved v15            ]
  312   // -25 [ saved v14            ]
  313   // -24 [ saved v13            ]
  314   // -23 [ saved v12            ]
  315   // -22 [ saved v11            ]
  316   // -21 [ saved v10            ]
  317   // -20 [ saved v9             ]
  318   // -19 [ saved v8             ]
  319   // -18 [ saved r28            ]
  320   // -17 [ saved r27            ]
  321   // -16 [ saved r26            ]
  322   // -15 [ saved r25            ]
  323   // -14 [ saved r24            ]
  324   // -13 [ saved r23            ]
  325   // -12 [ saved r22            ]
  326   // -11 [ saved r21            ]
  327   // -10 [ saved r20            ]
  328   //  -9 [ saved r19            ]
  329   //  -8 [ call wrapper    (r0) ]
  330   //  -7 [ result          (r1) ]
  331   //  -6 [ result type     (r2) ]
  332   //  -5 [ method          (r3) ]
  333   //  -4 [ entry point     (r4) ]
  334   //  -3 [ parameters      (r5) ]
  335   //  -2 [ parameter size  (r6) ]
  336   //  -1 [ thread (r7)          ]
  337   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  338   //   1 [ saved lr       (r30) ]
  339 
  340   // Call stub stack layout word offsets from fp
  341   enum call_stub_layout {
          // Word offsets from rfp; each use multiplies by wordSize to form an
          // Address. For register pairs only the lower-addressed slot is named:
          // the second register of the stp/stpd (ldp/ldpd) pair occupies the
          // next slot up.
  342     sp_after_call_off  = -28,  // lowest slot of the fixed save area; sp is set here after enter()
  343 
  344     fpcr_off           = sp_after_call_off,  // saved FPCR (slot -27 is not written by generate_call_stub)
  345     d15_off            = -26,  // v15 at -26, v14 at -25
  346     d13_off            = -24,  // v13 at -24, v12 at -23
  347     d11_off            = -22,  // v11 at -22, v10 at -21
  348     d9_off             = -20,  // v9  at -20, v8  at -19
  349 
  350     r28_off            = -18,  // r28 at -18, r27 at -17
  351     r26_off            = -16,  // r26 at -16, r25 at -15
  352     r24_off            = -14,  // r24 at -14, r23 at -13
  353     r22_off            = -12,  // r22 at -12, r21 at -11
  354     r20_off            = -10,  // r20 at -10, r19 at -9
  355     call_wrapper_off   =  -8,  // c_rarg0; asserted equal to frame::entry_frame_call_wrapper_offset
  356     result_off         =  -7,  // c_rarg1: address the Java result is written to
  357     result_type_off    =  -6,  // c_rarg2: BasicType selecting how the result is stored
  358     method_off         =  -5,  // c_rarg3
  359     entry_point_off    =  -4,  // c_rarg4 at -4, c_rarg5 (parameters) at -3
  360     parameter_size_off =  -2,  // c_rarg6, stored as a 32-bit value (strw)
  361     thread_off         =  -1,  // c_rarg7
  362     fp_f               =   0,  // saved fp slot, per the frame diagram preceding this enum
  363     retaddr_off        =   1,  // saved lr slot, per the frame diagram preceding this enum
  364   };
  365 
  366   address generate_call_stub(address& return_address) {
          // Emits the call stub used to call Java from C (register arguments and
          // frame layout are described in the comment preceding call_stub_layout)
          // and returns the stub's start address.
          //
          // return_address (out): set to the pc immediately following the blr
          // into the Java entry point.
          //
          // If load_archive_data() supplies a previously stored stub, nothing is
          // emitted and both the start and return_address come from the archive.
  367     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  368            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  369            "adjust this code");
  370 
  371     StubId stub_id = StubId::stubgen_call_stub_id;
  372     GrowableArray<address> entries;
  373     int entry_count = StubInfo::entry_count(stub_id);
          // two entries: the stub start plus one extra (the return address)
  374     assert(entry_count == 2, "sanity check");
  375     address start = load_archive_data(stub_id, &entries);
  376     if (start != nullptr) {
  377       assert(entries.length() == 1, "expected 1 extra entry");
  378       return_address = entries.at(0);
  379       return start;
  380     }
  381     StubCodeMark mark(this, stub_id);
  382     start = __ pc();
  383 
          // rfp-relative addresses of the save slots named in call_stub_layout
          // NOTE(review): sp_after_call is not referenced again in this function.
  384     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  385 
  386     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  387     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  388     const Address result        (rfp, result_off         * wordSize);
  389     const Address result_type   (rfp, result_type_off    * wordSize);
  390     const Address method        (rfp, method_off         * wordSize);
  391     const Address entry_point   (rfp, entry_point_off    * wordSize);
  392     const Address parameter_size(rfp, parameter_size_off * wordSize);
  393 
  394     const Address thread        (rfp, thread_off         * wordSize);
  395 
  396     const Address d15_save      (rfp, d15_off * wordSize);
  397     const Address d13_save      (rfp, d13_off * wordSize);
  398     const Address d11_save      (rfp, d11_off * wordSize);
  399     const Address d9_save       (rfp, d9_off * wordSize);
  400 
  401     const Address r28_save      (rfp, r28_off * wordSize);
  402     const Address r26_save      (rfp, r26_off * wordSize);
  403     const Address r24_save      (rfp, r24_off * wordSize);
  404     const Address r22_save      (rfp, r22_off * wordSize);
  405     const Address r20_save      (rfp, r20_off * wordSize);
  406 
  407     // stub code
  408 
          // NOTE(review): aarch64_entry is not referenced again in this function.
  409     address aarch64_entry = __ pc();
  410 
  411     // set up frame and move sp to end of save area
  412     __ enter();
          // sp = rfp - 28 words (sp_after_call_off is negative, hence the negation)
  413     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  414 
  415     // save register parameters and Java scratch/global registers
  416     // n.b. we save thread even though it gets installed in
  417     // rthread because we want to sanity check rthread later
  418     __ str(c_rarg7,  thread);
          // only the low 32 bits of the parameter size are stored
  419     __ strw(c_rarg6, parameter_size);
  420     __ stp(c_rarg4, c_rarg5,  entry_point);
  421     __ stp(c_rarg2, c_rarg3,  result_type);
  422     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  423 
  424     __ stp(r20, r19,   r20_save);
  425     __ stp(r22, r21,   r22_save);
  426     __ stp(r24, r23,   r24_save);
  427     __ stp(r26, r25,   r26_save);
  428     __ stp(r28, r27,   r28_save);
  429 
  430     __ stpd(v9,  v8,   d9_save);
  431     __ stpd(v11, v10,  d11_save);
  432     __ stpd(v13, v12,  d13_save);
  433     __ stpd(v15, v14,  d15_save);
  434 
          // save the caller's FPCR so it can be restored on exit
  435     __ get_fpcr(rscratch1);
  436     __ str(rscratch1, fpcr_save);
  437     // Set FPCR to the state we need. We do want Round to Nearest. We
  438     // don't want non-IEEE rounding modes or floating-point traps.
  439     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  440     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  441     __ set_fpcr(rscratch1);
  442 
  443     // install Java thread in global register now we have saved
  444     // whatever value it held
  445     __ mov(rthread, c_rarg7);
  446     // And method
  447     __ mov(rmethod, c_rarg3);
  448 
  449     // set up the heapbase register
  450     __ reinit_heapbase();
  451 
  452 #ifdef ASSERT
  453     // make sure we have no pending exceptions
  454     {
  455       Label L;
  456       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  457       __ cmp(rscratch1, (u1)NULL_WORD);
  458       __ br(Assembler::EQ, L);
  459       __ stop("StubRoutines::call_stub: entered with pending exception");
  460       __ BIND(L);
  461     }
  462 #endif
  463     // pass parameters if any
          // esp takes the current sp; sp is then lowered by c_rarg6 words and
          // rounded down to a multiple of 2 words
  464     __ mov(esp, sp);
  465     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  466     __ andr(sp, rscratch1, -2 * wordSize);
  467 
  468     BLOCK_COMMENT("pass parameters if any");
  469     Label parameters_done;
  470     // parameter count is still in c_rarg6
  471     // and parameter pointer identifying param 1 is in c_rarg5
  472     __ cbzw(c_rarg6, parameters_done);
  473 
          // copy loop: load the next parameter word (post-incrementing c_rarg5),
          // decrement the remaining count, push the word, repeat while count > 0
  474     address loop = __ pc();
  475     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  476     __ subsw(c_rarg6, c_rarg6, 1);
  477     __ push(rscratch1);
  478     __ br(Assembler::GT, loop);
  479 
  480     __ BIND(parameters_done);
  481 
  482     // call Java entry -- passing Method*, and current sp
  483     //      rmethod: Method*
  484     //      r19_sender_sp: sender sp
  485     BLOCK_COMMENT("call Java function");
  486     __ mov(r19_sender_sp, sp);
  487     __ blr(c_rarg4);
  488 
  489     // we do this here because the notify will already have been done
  490     // if we get to the next instruction via an exception
  491     //
  492     // n.b. adding this instruction here affects the calculation of
  493     // whether or not a routine returns to the call stub (used when
  494     // doing stack walks) since the normal test is to check the return
  495     // pc against the address saved below. so we may need to allow for
  496     // this extra instruction in the check.
          //
          // NOTE(review): no instruction is emitted between the blr above and
          // the return_address capture below, so the remark above looks
          // historical -- confirm and consider removing it.
  497 
  498     // save current address for use by exception handling code
  499 
  500     return_address = __ pc();
  501     entries.append(return_address);
  502 
  503     // store result depending on type (everything that is not
  504     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  505     // n.b. this assumes Java returns an integral result in r0
  506     // and a floating result in j_farg0
          // j_rarg2 = result address, j_rarg1 = result type (reloaded from the frame)
  507     __ ldr(j_rarg2, result);
  508     Label is_long, is_float, is_double, exit;
  509     __ ldr(j_rarg1, result_type);
          // T_OBJECT shares the is_long path: both store all 64 bits of r0
  510     __ cmp(j_rarg1, (u1)T_OBJECT);
  511     __ br(Assembler::EQ, is_long);
  512     __ cmp(j_rarg1, (u1)T_LONG);
  513     __ br(Assembler::EQ, is_long);
  514     __ cmp(j_rarg1, (u1)T_FLOAT);
  515     __ br(Assembler::EQ, is_float);
  516     __ cmp(j_rarg1, (u1)T_DOUBLE);
  517     __ br(Assembler::EQ, is_double);
  518 
  519     // handle T_INT case
  520     __ strw(r0, Address(j_rarg2));
  521 
          // common exit; the out-of-line result stores below branch back here
  522     __ BIND(exit);
  523 
  524     // pop parameters
          // esp = rfp - 28 words, i.e. the bottom of the fixed save area
  525     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  526 
  527 #ifdef ASSERT
  528     // verify that threads correspond
  529     {
  530       Label L, S;
  531       __ ldr(rscratch1, thread);
  532       __ cmp(rthread, rscratch1);
  533       __ br(Assembler::NE, S);
  534       __ get_thread(rscratch1);
  535       __ cmp(rthread, rscratch1);
  536       __ br(Assembler::EQ, L);
  537       __ BIND(S);
  538       __ stop("StubRoutines::call_stub: threads must correspond");
  539       __ BIND(L);
  540     }
  541 #endif
  542 
  543     __ pop_cont_fastpath(rthread);
  544 
  545     // restore callee-save registers
  546     __ ldpd(v15, v14,  d15_save);
  547     __ ldpd(v13, v12,  d13_save);
  548     __ ldpd(v11, v10,  d11_save);
  549     __ ldpd(v9,  v8,   d9_save);
  550 
  551     __ ldp(r28, r27,   r28_save);
  552     __ ldp(r26, r25,   r26_save);
  553     __ ldp(r24, r23,   r24_save);
  554     __ ldp(r22, r21,   r22_save);
  555     __ ldp(r20, r19,   r20_save);
  556 
  557     // restore fpcr
  558     __ ldr(rscratch1,  fpcr_save);
  559     __ set_fpcr(rscratch1);
  560 
          // reload the C argument registers from their save slots
  561     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  562     __ ldrw(c_rarg2, result_type);
  563     __ ldr(c_rarg3,  method);
  564     __ ldp(c_rarg4, c_rarg5,  entry_point);
          // NOTE(review): the parameter size was stored with strw (32 bits) but
          // is reloaded here as a full 64-bit slot, so the upper half of c_rarg6
          // is whatever the slot already held -- presumably harmless since the
          // value is an int; confirm.
  565     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  566 
  567     // leave frame and return to caller
  568     __ leave();
  569     __ ret(lr);
  570 
  571     // handle return types different from T_INT
  572 
          // 64-bit integral/oop result (T_LONG and T_OBJECT)
  573     __ BIND(is_long);
  574     __ str(r0, Address(j_rarg2, 0));
  575     __ br(Assembler::AL, exit);
  576 
          // single-precision result from j_farg0
  577     __ BIND(is_float);
  578     __ strs(j_farg0, Address(j_rarg2, 0));
  579     __ br(Assembler::AL, exit);
  580 
          // double-precision result from j_farg0
  581     __ BIND(is_double);
  582     __ strd(j_farg0, Address(j_rarg2, 0));
  583     __ br(Assembler::AL, exit);
  584 
  585     // record the stub entry and end plus the auxiliary entry
  586     store_archive_data(stub_id, start, __ pc(), &entries);
  587 
  588     return start;
  589   }
  590 
  591   // Return point for a Java call if there's an exception thrown in
  592   // Java code.  The exception is caught and transformed into a
  593   // pending exception stored in JavaThread that can be tested from
  594   // within the VM.
  595   //
  596   // Note: Usually the parameters are removed by the callee. In case
  597   // of an exception crossing an activation frame boundary, that is
  598   // not the case if the callee is compiled code => need to setup the
  599   // rsp.
  600   //
  601   // r0: exception oop
  602 
  603   address generate_catch_exception() {
          // Emits the catch_exception stub and returns its start address. The
          // emitted code stores the exception oop in r0, together with this
          // source file's name and a line number, into the thread's
          // pending-exception fields and then branches to
          // StubRoutines::_call_stub_return_address. That address must already
          // be set when this generator runs (asserted below).
          //
          // If load_archive_data() supplies a previously stored stub, nothing
          // is emitted and the archived start address is returned.
  604     StubId stub_id = StubId::stubgen_catch_exception_id;
  605     int entry_count = StubInfo::entry_count(stub_id);
  606     assert(entry_count == 1, "sanity check");
  607     address start = load_archive_data(stub_id);
  608     if (start != nullptr) {
  609       return start;
  610     }
  611     StubCodeMark mark(this, stub_id);
  612     start = __ pc();
  613 
  614     // same as in generate_call_stub():
          // NOTE(review): sp_after_call is not referenced again in this function.
  615     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  616     const Address thread        (rfp, thread_off         * wordSize);
  617 
  618 #ifdef ASSERT
  619     // verify that threads correspond
  620     {
  621       Label L, S;
  622       __ ldr(rscratch1, thread);
  623       __ cmp(rthread, rscratch1);
  624       __ br(Assembler::NE, S);
  625       __ get_thread(rscratch1);
  626       __ cmp(rthread, rscratch1);
  627       __ br(Assembler::EQ, L);
  628       __ bind(S);
  629       __ stop("StubRoutines::catch_exception: threads must correspond");
  630       __ bind(L);
  631     }
  632 #endif
  633 
  634     // set pending exception
  635     __ verify_oop(r0);
  636 
  637     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  638     // special case -- add file name string to AOT address table
          // the recorded file and line identify this generator's source
          // location (__FILE__ / __LINE__ are evaluated here, at stub
          // generation time)
  639     address file = (address)AOTCodeCache::add_C_string(__FILE__);
  640     __ lea(rscratch1, ExternalAddress(file));
  641     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  642     __ movw(rscratch1, (int)__LINE__);
  643     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  644 
  645     // complete return to VM
  646     assert(StubRoutines::_call_stub_return_address != nullptr,
  647            "_call_stub_return_address must have been generated before");
          // continue at the call stub's post-call return point
  648     __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));
  649 
  650     // record the stub entry and end
  651     store_archive_data(stub_id, start, __ pc());
  652 
  653     return start;
  654   }
  655 
  656   // Continuation point for runtime calls returning with a pending
  657   // exception.  The pending exception check happened in the runtime
  658   // or native call stub.  The pending exception in Thread is
  659   // converted into a Java-level exception.
  660   //
  661   // Contract with Java-level exception handlers:
  662   // r0: exception
  663   // r3: throwing pc
  664   //
  665   // NOTE: At entry of this stub, exception-pc must be in LR !!
  666 
  667   // NOTE: this is always used as a jump target within generated code
  668   // so it just needs to be generated code with no x86 prolog
  669 
  670   address generate_forward_exception() {
        // Emits the forward-exception stub and returns its entry address.
        // If load_archive_data() already supplies an address for this stub
        // id, that address is returned and no code is emitted.
  671     StubId stub_id = StubId::stubgen_forward_exception_id;
  672     int entry_count = StubInfo::entry_count(stub_id);
  673     assert(entry_count == 1, "sanity check");
  674     address start = load_archive_data(stub_id);
  675     if (start != nullptr) {
  676       return start;
  677     }
  678     StubCodeMark mark(this, stub_id);
  679     start = __ pc();
  680
  681     // Upon entry, LR points to the return address returning into
  682     // Java (interpreted or compiled) code; i.e., the return address
  683     // becomes the throwing pc.
  684     //
  685     // Arguments pushed before the runtime call are still on the stack
  686     // but the exception handler will reset the stack pointer ->
  687     // ignore them.  A potential result in registers can be ignored as
  688     // well.
  689
  690 #ifdef ASSERT
  691     // make sure this code is only executed if there is a pending exception
  692     {
  693       Label L;
  694       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  695       __ cbnz(rscratch1, L);
  696       __ stop("StubRoutines::forward exception: no pending exception (1)");
  697       __ bind(L);
  698     }
  699 #endif
  700
  701     // compute exception handler into r19
  702
  703     // call the VM to find the handler address associated with the
  704     // caller address. pass thread in r0 and caller pc (ret address)
  705     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  706     // the stack.
  707     __ mov(c_rarg1, lr);
  708     // lr will be trashed by the VM call so we move it to R19
  709     // (callee-saved) because we also need to pass it to the handler
  710     // returned by this call.
  711     __ mov(r19, lr);
  712     BLOCK_COMMENT("call exception_handler_for_return_address");
  713     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  714                          SharedRuntime::exception_handler_for_return_address),
  715                     rthread, c_rarg1);
  716     // Reinitialize the ptrue predicate register, in case the external runtime
  717     // call clobbers ptrue reg, as we may return to SVE compiled code.
  718     __ reinitialize_ptrue();
  719
  720     // we should not really care that lr is no longer the callee
  721     // address. we saved the value the handler needs in r19 so we can
  722     // just copy it to r3. however, the C2 handler will push its own
  723     // frame and then calls into the VM and the VM code asserts that
  724     // the PC for the frame above the handler belongs to a compiled
  725     // Java method. So, we restore lr here to satisfy that assert.
  726     __ mov(lr, r19);
  727     // setup r0 & r3 & clear pending exception
  728     __ mov(r3, r19);
          // r19 has now been copied to both lr and r3, so it can be reused.
          // r0 is presumably the handler address returned by the leaf call
          // (see the "r19: exception handler" note below); it must be moved
          // out of r0 before r0 is overwritten with the exception oop.
  729     __ mov(r19, r0);
          // Load the pending exception into r0, then store zero to the same
          // thread field to clear it.
  730     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  731     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  732
  733 #ifdef ASSERT
  734     // make sure exception is set
  735     {
  736       Label L;
  737       __ cbnz(r0, L);
  738       __ stop("StubRoutines::forward exception: no pending exception (2)");
  739       __ bind(L);
  740     }
  741 #endif
  742
  743     // continue at exception handler
  744     // r0: exception
  745     // r3: throwing pc
  746     // r19: exception handler
  747     __ verify_oop(r0);
          // Indirect jump (not a call): lr still holds the throwing pc
          // restored above.
  748     __ br(r19);
  749
  750     // record the stub entry and end
  751     store_archive_data(stub_id, start, __ pc());
  752
  753     return start;
  754   }
  755 
  756   // Non-destructive plausibility checks for oops
  757   //
  758   // Arguments:
  759   //    r0: oop to verify
  760   //    rscratch1: error message
  761   //
  762   // Stack after saving c_rarg3:
  763   //    [tos + 0]: saved c_rarg3
  764   //    [tos + 1]: saved c_rarg2
  765   //    [tos + 2]: saved lr
  766   //    [tos + 3]: saved rscratch2
  767   //    [tos + 4]: saved r0
  768   //    [tos + 5]: saved rscratch1
  769   address generate_verify_oop() {
        // Emits the verify_oop stub and returns its entry address.  If
        // load_archive_data() already supplies an address for this stub id,
        // that address is returned and no code is emitted.
  770     StubId stub_id = StubId::stubgen_verify_oop_id;
  771     int entry_count = StubInfo::entry_count(stub_id);
  772     assert(entry_count == 1, "sanity check");
  773     address start = load_archive_data(stub_id);
  774     if (start != nullptr) {
  775       return start;
  776     }
  777     StubCodeMark mark(this, stub_id);
  778     start = __ pc();
  779
  780     Label exit, error;
  781
  782     // save c_rarg2 and c_rarg3
  783     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  784
          // Bump the verify-oop counter with a plain load/add/store sequence
          // (no atomic instruction is used).
          // NOTE(review): ldr/str are used here rather than ldrw/strw, while the
          // commented-out x86-style incrementl below suggests a 32-bit counter
          // -- confirm the declared width of the counter.
  785     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  786     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  787     __ ldr(c_rarg3, Address(c_rarg2));
  788     __ add(c_rarg3, c_rarg3, 1);
  789     __ str(c_rarg3, Address(c_rarg2));
  790
  791     // object is in r0
  792     // make sure object is 'reasonable'
  793     __ cbz(r0, exit); // if obj is null it is OK
  794
          // Delegate the actual check to the active GC's barrier assembler;
          // c_rarg2 and c_rarg3 are passed as temporaries and the check
          // branches to 'error' on failure.
  795     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  796     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  797
  798     // return if everything seems ok
  799     __ bind(exit);
  800
  801     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  802     __ ret(lr);
  803
  804     // handle errors
  805     __ bind(error);
          // Restore c_rarg3/c_rarg2 first so that the register dump pushed
          // below contains the values they had on entry to the stub.
  806     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  807
  808     __ push(RegSet::range(r0, r29), sp);
  809     // debug(char* msg, int64_t pc, int64_t regs[])
  810     __ mov(c_rarg0, rscratch1);      // pass address of error message
  811     __ mov(c_rarg1, lr);             // pass return address
  812     __ mov(c_rarg2, sp);             // pass address of regs on stack
  813 #ifndef PRODUCT
  814     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  815 #endif
  816     BLOCK_COMMENT("call MacroAssembler::debug");
  817     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  818     __ blr(rscratch1);
          // Presumably debug64 does not return; halt if it ever does.
  819     __ hlt(0);
  820
  821     // record the stub entry and end
  822     store_archive_data(stub_id, start, __ pc());
  823
  824     return start;
  825   }
  826 
  827   // Generate indices for iota vector.
  828   void generate_iota_indices(StubId stub_id) {
  829     GrowableArray<address> entries;
  830     int entry_count = StubInfo::entry_count(stub_id);
  831     assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
  832     address start = load_archive_data(stub_id, &entries);
  833     if (start != nullptr) {
  834       assert(entries.length() == entry_count - 1,
  835              "unexpected entries count %d", entries.length());
  836       StubRoutines::aarch64::_vector_iota_indices[0] = start;
  837       for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
  838         StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
  839       }
  840       return;
  841     }
  842     __ align(CodeEntryAlignment);
  843     StubCodeMark mark(this, stub_id);
  844     start = __ pc();
  845     // B
  846     __ emit_data64(0x0706050403020100, relocInfo::none);
  847     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  848     entries.append(__ pc());
  849     // H
  850     __ emit_data64(0x0003000200010000, relocInfo::none);
  851     __ emit_data64(0x0007000600050004, relocInfo::none);
  852     entries.append(__ pc());
  853     // S
  854     __ emit_data64(0x0000000100000000, relocInfo::none);
  855     __ emit_data64(0x0000000300000002, relocInfo::none);
  856     entries.append(__ pc());
  857     // D
  858     __ emit_data64(0x0000000000000000, relocInfo::none);
  859     __ emit_data64(0x0000000000000001, relocInfo::none);
  860     entries.append(__ pc());
  861     // S - FP
  862     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  863     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  864     entries.append(__ pc());
  865     // D - FP
  866     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  867     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  868 
  869     // record the stub entry and end
  870     store_archive_data(stub_id, start, __ pc(), &entries);
  871 
  872     // install the entry addresses in the entry array
  873     assert(entries.length() == entry_count - 1,
  874            "unexpected entries count %d", entries.length());
  875     StubRoutines::aarch64::_vector_iota_indices[0] = start;
  876     for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
  877       StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
  878     }
  879   }
  880 
  881   // The inner part of zero_words().  This is the bulk operation,
  882   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  883   // caller is responsible for zeroing the last few words.
  884   //
  885   // Inputs:
  886   // r10: the HeapWord-aligned base address of an array to zero.
  887   // r11: the count in HeapWords, r11 > 0.
  888   //
  889   // Returns r10 and r11, adjusted for the caller to clear.
  890   // r10: the base address of the tail of words left to clear.
  891   // r11: the number of words in the tail.
  892   //      r11 < MacroAssembler::zero_words_block_size.
  893 
  894   address generate_zero_blocks() {
        // Emits the zero_blocks stub and returns its entry address.  If
        // load_archive_data() already supplies an address for this stub id,
        // that address is returned and no code is emitted.
  895     StubId stub_id = StubId::stubgen_zero_blocks_id;
  896     int entry_count = StubInfo::entry_count(stub_id);
  897     assert(entry_count == 1, "sanity check");
  898     address start = load_archive_data(stub_id);
  899     if (start != nullptr) {
  900       return start;
  901     }
  902     __ align(CodeEntryAlignment);
  903     StubCodeMark mark(this, stub_id);
  904     Label done;
  905     Label base_aligned;
  906
  907     Register base = r10, cnt = r11;
  908
  909     start = __ pc();
  910
  911     if (UseBlockZeroing) {
  912       int zva_length = VM_Version::zva_length();
  913
  914       // Ensure ZVA length can be divided by 16. This is required by
  915       // the subsequent operations.
  916       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  917
            // If bit 3 of base is set, store a single zero word and advance
            // base by 8 so that bit 3 becomes clear; the count drops by one
            // word accordingly.
  918       __ tbz(base, 3, base_aligned);
  919       __ str(zr, Address(__ post(base, 8)));
  920       __ sub(cnt, cnt, 1);
  921       __ bind(base_aligned);
  922
  923       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  924       // alignment.
  925       Label small;
  926       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
            // cnt is in words, so the limit is shifted right by 3 (presumably
            // a byte-to-word conversion -- TODO confirm BlockZeroingLowLimit
            // is in bytes).  The subtraction result goes to rscratch1, so
            // only the flags are consumed and cnt is left unchanged.
  927       __ subs(rscratch1, cnt, low_limit >> 3);
  928       __ br(Assembler::LT, small);
  929       __ zero_dcache_blocks(base, cnt);
  930       __ bind(small);
  931     }
  932
  933     {
  934       // Number of stp instructions we'll unroll
  935       const int unroll =
  936         MacroAssembler::zero_words_block_size / 2;
  937       // Clear the remaining blocks.
  938       Label loop;
            // Pre-subtract one block (unroll * 2 words); if that goes
            // negative there is no full block left to clear.
  939       __ subs(cnt, cnt, unroll * 2);
  940       __ br(Assembler::LT, done);
  941       __ bind(loop);
            // Each stp stores two zero words and post-increments base by 16.
  942       for (int i = 0; i < unroll; i++)
  943         __ stp(zr, zr, __ post(base, 16));
  944       __ subs(cnt, cnt, unroll * 2);
  945       __ br(Assembler::GE, loop);
  946       __ bind(done);
            // Undo the final over-subtraction so cnt holds the number of
            // words still to be cleared by the caller.
  947       __ add(cnt, cnt, unroll * 2);
  948     }
  949
  950     __ ret(lr);
  951
  952     // record the stub entry and end
  953     store_archive_data(stub_id, start, __ pc());
  954
  955     return start;
  956   }
  957 
  958 
  // Direction of an array copy.  The enumerator values are used directly
  // as a sign when computing offsets (e.g. 'wordSize * direction' in
  // generate_copy_longs), so they must stay +1 and -1.
  enum copy_direction {
    copy_backwards = -1,
    copy_forwards  = 1
  };
  963 
  964   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  965   // for arraycopy stubs.
  966   class ArrayCopyBarrierSetHelper : StackObj {
  967     BarrierSetAssembler* _bs_asm;
  968     MacroAssembler* _masm;
  969     DecoratorSet _decorators;
  970     BasicType _type;
  971     Register _gct1;
  972     Register _gct2;
  973     Register _gct3;
  974     FloatRegister _gcvt1;
  975     FloatRegister _gcvt2;
  976     FloatRegister _gcvt3;
  977 
  978   public:
  979     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  980                               DecoratorSet decorators,
  981                               BasicType type,
  982                               Register gct1,
  983                               Register gct2,
  984                               Register gct3,
  985                               FloatRegister gcvt1,
  986                               FloatRegister gcvt2,
  987                               FloatRegister gcvt3)
  988       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  989         _masm(masm),
  990         _decorators(decorators),
  991         _type(type),
  992         _gct1(gct1),
  993         _gct2(gct2),
  994         _gct3(gct3),
  995         _gcvt1(gcvt1),
  996         _gcvt2(gcvt2),
  997         _gcvt3(gcvt3) {
  998     }
  999 
 1000     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 1001       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 1002                             dst1, dst2, src,
 1003                             _gct1, _gct2, _gcvt1);
 1004     }
 1005 
 1006     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 1007       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 1008                              dst, src1, src2,
 1009                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 1010     }
 1011 
 1012     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 1013       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 1014                             dst1, dst2, src,
 1015                             _gct1);
 1016     }
 1017 
 1018     void copy_store_at_16(Address dst, Register src1, Register src2) {
 1019       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 1020                              dst, src1, src2,
 1021                              _gct1, _gct2, _gct3);
 1022     }
 1023 
 1024     void copy_load_at_8(Register dst, Address src) {
 1025       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 1026                             dst, noreg, src,
 1027                             _gct1);
 1028     }
 1029 
 1030     void copy_store_at_8(Address dst, Register src) {
 1031       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 1032                              dst, src, noreg,
 1033                              _gct1, _gct2, _gct3);
 1034     }
 1035   };
 1036 
 1037   // Bulk copy of blocks of 8 words.
 1038   //
 1039   // count is a count of words.
 1040   //
 1041   // Precondition: count >= 8
 1042   //
 1043   // Postconditions:
 1044   //
 1045   // The least significant bit of count contains the remaining count
 1046   // of words to copy.  The rest of count is trash.
 1047   //
 1048   // s and d are adjusted to point to the remaining words to copy
 1049   //
 1050   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
        // Emits a bulk word-copy stub for stub_id and returns its entry
        // address.  If load_archive_data() already supplies an address for
        // this stub id, that address is returned and no code is emitted.
        // The stub id selects the copy direction and the element type
        // (T_BYTE or T_OBJECT) handed to the GC barrier helper.
 1051     int entry_count = StubInfo::entry_count(stub_id);
 1052     assert(entry_count == 1, "sanity check");
 1053     address start = load_archive_data(stub_id);
 1054     if (start != nullptr) {
 1055       return start;
 1056     }
 1057     BasicType type;
 1058     copy_direction direction;
 1059
 1060     switch (stub_id) {
 1061     case StubId::stubgen_copy_byte_f_id:
 1062       direction = copy_forwards;
 1063       type = T_BYTE;
 1064       break;
 1065     case StubId::stubgen_copy_byte_b_id:
 1066       direction = copy_backwards;
 1067       type = T_BYTE;
 1068       break;
 1069     case StubId::stubgen_copy_oop_f_id:
 1070       direction = copy_forwards;
 1071       type = T_OBJECT;
 1072       break;
 1073     case StubId::stubgen_copy_oop_b_id:
 1074       direction = copy_backwards;
 1075       type = T_OBJECT;
 1076       break;
 1077     case StubId::stubgen_copy_oop_uninit_f_id:
 1078       direction = copy_forwards;
 1079       type = T_OBJECT;
 1080       break;
 1081     case StubId::stubgen_copy_oop_uninit_b_id:
 1082       direction = copy_backwards;
 1083       type = T_OBJECT;
 1084       break;
 1085     default:
          // NOTE(review): type and direction stay unassigned on this path;
          // this presumably relies on ShouldNotReachHere() not returning.
 1086       ShouldNotReachHere();
 1087     }
 1088
          // unit is one word in bytes, signed by the copy direction.  bias is
          // the size in bytes of the first load (32 with SIMD, 16 without);
          // for forward copies s and d are moved back by bias so that the same
          // 'N * unit' offsets can be used for both directions.
 1089     int unit = wordSize * direction;
 1090     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 1091
 1092     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 1093       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 1094     const Register stride = r14;
 1095     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1096     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1097     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1098
 1099     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 1100     assert_different_registers(s, d, count, rscratch1, rscratch2);
 1101
 1102     Label again, drain;
 1103
 1104     __ align(CodeEntryAlignment);
 1105
 1106     StubCodeMark mark(this, stub_id);
 1107
 1108     start = __ pc();
 1109
          // When unaligned accesses are to be avoided and bit 3 of d is set,
          // use the separate code path emitted after the main path's ret.
          // Note that this branch is emitted before the ASSERT count check
          // below, so that path does not execute the check.
 1110     Label unaligned_copy_long;
 1111     if (AvoidUnalignedAccesses) {
 1112       __ tbnz(d, 3, unaligned_copy_long);
 1113     }
 1114
 1115     if (direction == copy_forwards) {
 1116       __ sub(s, s, bias);
 1117       __ sub(d, d, bias);
 1118     }
 1119
 1120 #ifdef ASSERT
 1121     // Make sure we are never given < 8 words
 1122     {
 1123       Label L;
 1124       __ cmp(count, (u1)8);
 1125       __ br(Assembler::GE, L);
 1126       __ stop("genrate_copy_longs called with < 8 words");
 1127       __ bind(L);
 1128     }
 1129 #endif
 1130
 1131     // Fill 8 registers
 1132     if (UseSIMDForMemoryOps) {
 1133       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 1134       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 1135     } else {
 1136       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1137       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1138       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1139       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1140     }
 1141
          // 8 words are now held in registers.  Subtracting 16 tests whether
          // a further 8 words can be loaded while these are stored; if not,
          // go straight to the drain.  Only multiples of 8 are ever
          // subtracted from count, so its low three bits are preserved for
          // the tail tests after the drain.
 1142     __ subs(count, count, 16);
 1143     __ br(Assembler::LO, drain);
 1144
          // For backward copies the prefetch offset is negated; when its
          // magnitude exceeds 256 it is materialized in the stride register
          // (presumably because it cannot be encoded as an immediate
          // offset -- TODO confirm).
 1145     int prefetch = PrefetchCopyIntervalInBytes;
 1146     bool use_stride = false;
 1147     if (direction == copy_backwards) {
 1148       use_stride = prefetch > 256;
 1149       prefetch = -prefetch;
 1150       if (use_stride) __ mov(stride, prefetch);
 1151     }
 1152
 1153     __ bind(again);
 1154
 1155     if (PrefetchCopyIntervalInBytes > 0)
 1156       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1157
          // Main loop body: store the 8 words loaded previously while loading
          // the next 8, interleaving stores and loads.
 1158     if (UseSIMDForMemoryOps) {
 1159       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 1160       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 1161       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 1162       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 1163     } else {
 1164       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1165       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1166       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 1167       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1168       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 1169       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1170       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 1171       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1172     }
 1173
 1174     __ subs(count, count, 8);
 1175     __ br(Assembler::HS, again);
 1176
 1177     // Drain
          // Store the last 8 words that were loaded but not yet written.
 1178     __ bind(drain);
 1179     if (UseSIMDForMemoryOps) {
 1180       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 1181       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 1182     } else {
 1183       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1184       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 1185       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 1186       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 1187     }
 1188
 1189     {
            // Tail: bit 2 of count selects a 4-word sub-block and bit 1 a
            // 2-word sub-block; bit 0 is left for the caller.
 1190       Label L1, L2;
 1191       __ tbz(count, exact_log2(4), L1);
 1192       if (UseSIMDForMemoryOps) {
 1193         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 1194         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 1195       } else {
 1196         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1197         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1198         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1199         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 1200       }
 1201       __ bind(L1);
 1202
            // Undo the forward-copy bias before the final 2-word step, which
            // uses adjust() instead of biased offsets.
 1203       if (direction == copy_forwards) {
 1204         __ add(s, s, bias);
 1205         __ add(d, d, bias);
 1206       }
 1207
 1208       __ tbz(count, 1, L2);
 1209       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 1210       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 1211       __ bind(L2);
 1212     }
 1213
 1214     __ ret(lr);
 1215
 1216     if (AvoidUnalignedAccesses) {
            // These labels shadow the outer 'drain' and 'again' and belong to
            // the loop of this alternative path only.
 1217       Label drain, again;
 1218       // Register order for storing. Order is different for backward copy.
 1219
 1220       __ bind(unaligned_copy_long);
 1221
 1222       // source address is even aligned, target odd aligned
 1223       //
 1224       // when forward copying word pairs we read long pairs at offsets
 1225       // {0, 2, 4, 6} (in long words). when backwards copying we read
 1226       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 1227       // address by -2 in the forwards case so we can compute the
 1228       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 1229       // or -1.
 1230       //
 1231       // when forward copying we need to store 1 word, 3 pairs and
 1232       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 1233       // zero offset we adjust the destination by -1 which means we
 1234       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 1235       //
 1236       // When backwards copying we need to store 1 word, 3 pairs and
 1237       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1238       // offsets {1, 3, 5, 7, 8} * unit.
 1239
 1240       if (direction == copy_forwards) {
 1241         __ sub(s, s, 16);
 1242         __ sub(d, d, 8);
 1243       }
 1244
 1245       // Fill 8 registers
 1246       //
 1247       // for forwards copy s was offset by -16 from the original input
 1248       // value of s so the register contents are at these offsets
 1249       // relative to the 64 byte block addressed by that original input
 1250       // and so on for each successive 64 byte block when s is updated
 1251       //
 1252       // t0 at offset 0,  t1 at offset 8
 1253       // t2 at offset 16, t3 at offset 24
 1254       // t4 at offset 32, t5 at offset 40
 1255       // t6 at offset 48, t7 at offset 56
 1256
 1257       // for backwards copy s was not offset so the register contents
 1258       // are at these offsets into the preceding 64 byte block
 1259       // relative to that original input and so on for each successive
 1260       // preceding 64 byte block when s is updated. this explains the
 1261       // slightly counter-intuitive looking pattern of register usage
 1262       // in the stp instructions for backwards copy.
 1263       //
 1264       // t0 at offset -16, t1 at offset -8
 1265       // t2 at offset -32, t3 at offset -24
 1266       // t4 at offset -48, t5 at offset -40
 1267       // t6 at offset -64, t7 at offset -56
 1268
 1269       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1270       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1271       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1272       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1273
 1274       __ subs(count, count, 16);
 1275       __ br(Assembler::LO, drain);
 1276
 1277       int prefetch = PrefetchCopyIntervalInBytes;
 1278       bool use_stride = false;
 1279       if (direction == copy_backwards) {
 1280         use_stride = prefetch > 256;
 1281         prefetch = -prefetch;
 1282         if (use_stride) __ mov(stride, prefetch);
 1283       }
 1284
 1285       __ bind(again);
 1286
 1287       if (PrefetchCopyIntervalInBytes > 0)
 1288         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1289
 1290       if (direction == copy_forwards) {
 1291         // allowing for the offset of -8 the store instructions place
 1292         // registers into the target 64 byte block at the following
 1293         // offsets
 1294         //
 1295         // t0 at offset 0
 1296         // t1 at offset 8,  t2 at offset 16
 1297         // t3 at offset 24, t4 at offset 32
 1298         // t5 at offset 40, t6 at offset 48
 1299         // t7 at offset 56
 1300
 1301         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1302         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1303         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1304         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1305         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1306         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1307         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1308         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1309         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1310       } else {
 1311         // d was not offset when we started so the registers are
 1312         // written into the 64 byte block preceding d with the following
 1313         // offsets
 1314         //
 1315         // t1 at offset -8
 1316         // t3 at offset -24, t0 at offset -16
 1317         // t5 at offset -40, t2 at offset -32
 1318         // t7 at offset -56, t4 at offset -48
 1319         //                   t6 at offset -64
 1320         //
 1321         // note that this matches the offsets previously noted for the
 1322         // loads
 1323
 1324         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1325         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1326         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1327         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1328         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1329         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1330         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1331         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1332         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1333       }
 1334
 1335       __ subs(count, count, 8);
 1336       __ br(Assembler::HS, again);
 1337
 1338       // Drain
 1339       //
 1340       // this uses the same pattern of offsets and register arguments
 1341       // as above
 1342       __ bind(drain);
 1343       if (direction == copy_forwards) {
 1344         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1345         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1346         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1347         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1348         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1349       } else {
 1350         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1351         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1352         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1353         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1354         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1355       }
 1356       // now we need to copy any remaining part block which may
 1357       // include a 4 word subblock and/or a 2 word subblock.
 1358       // bits 2 and 1 in the count are the tell-tale for whether we
 1359       // have each such subblock
 1360       {
 1361         Label L1, L2;
 1362         __ tbz(count, exact_log2(4), L1);
 1363         // this is the same as above but copying only 4 longs hence
 1364         // with only one intervening stp between the str instructions
 1365         // but note that the offsets and registers still follow the
 1366         // same pattern
 1367         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1368         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1369         if (direction == copy_forwards) {
 1370           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1371           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1372           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1373         } else {
 1374           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1375           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1376           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1377         }
 1378         __ bind(L1);
 1379
 1380         __ tbz(count, 1, L2);
 1381         // this is the same as above but copying only 2 longs hence
 1382         // there is no intervening stp between the str instructions
 1383         // but note that the offset and register patterns are still
 1384         // the same
 1385         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1386         if (direction == copy_forwards) {
 1387           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1388           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1389         } else {
 1390           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1391           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1392         }
 1393         __ bind(L2);
 1394
 1395         // for forwards copy we need to re-adjust the offsets we
 1396         // applied so that s and d follow the last words written
 1397
 1398         if (direction == copy_forwards) {
 1399           __ add(s, s, 16);
 1400           __ add(d, d, 8);
 1401         }
 1402
 1403       }
 1404
 1405       __ ret(lr);
 1406     }
 1407
 1408     // record the stub entry and end
 1409     store_archive_data(stub_id, start, __ pc());
 1410
 1411     return start;
 1412   }
 1413 
 1414   // Small copy: less than 16 bytes.
 1415   //
 1416   // NB: Ignores all of the bits of count which represent more than 15
 1417   // bytes, so a caller doesn't have to mask them.
 1418 
 1419   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
        // Emits inline code that copies the sub-16-byte tail selected by the
        // low bits of count.  step is the signed element size in bytes: its
        // sign gives the copy direction and its magnitude the granularity.
        // s and d are advanced (or moved back) by each piece copied, via
        // adjust().
 1420     bool is_backwards = step < 0;
 1421     size_t granularity = g_uabs(step);
 1422     int direction = is_backwards ? -1 : 1;
 1423
 1424     Label Lword, Lint, Lshort, Lbyte;
 1425
 1426     assert(granularity
 1427            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1428
 1429     const Register t0 = r3;
 1430     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1431     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1432
 1433     // ??? I don't know if this bit-test-and-branch is the right thing
 1434     // to do.  It does a lot of jumping, resulting in several
 1435     // mispredicted branches.  It might make more sense to do this
 1436     // with something like Duff's device with a single computed branch.
 1437
          // count is in elements, so the bit of count that stands for 8 bytes
          // is bit (3 - log2(granularity)).  If it is set, copy one word.  Only
          // this word-sized copy goes through the GC barrier helper; the
          // narrower copies below use plain loads and stores.
 1438     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1439     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1440     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1441     __ bind(Lword);
 1442
          // 4-byte piece: only possible when elements are at most 4 bytes.
 1443     if (granularity <= sizeof (jint)) {
 1444       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1445       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1446       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1447       __ bind(Lint);
 1448     }
 1449
          // 2-byte piece: only possible when elements are at most 2 bytes.
 1450     if (granularity <= sizeof (jshort)) {
 1451       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1452       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1453       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1454       __ bind(Lshort);
 1455     }
 1456
          // Single byte: only possible for byte-sized elements.
 1457     if (granularity <= sizeof (jbyte)) {
 1458       __ tbz(count, 0, Lbyte);
 1459       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1460       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1461       __ bind(Lbyte);
 1462     }
 1463   }
 1464 
 1465   // All-singing all-dancing memory copy.
 1466   //
 1467   // Copy count units of memory from s to d.  The size of a unit is
 1468   // step, which can be positive or negative depending on the direction
 1469   // of copy.  If is_aligned is false, we align the source address.
 1470   //
        // Copies of at most 80 bytes (96 with UseSIMDForMemoryOps) are done
        // inline; larger copies align s to a 2-word boundary, call one of the
        // StubRoutines::aarch64::copy_*_f/_b stubs with the word count in r15,
        // and finish with copy_memory_small for the tail.
        //
        //   decorators, type - forwarded to the ArrayCopyBarrierSetHelper; type
        //                      and IS_DEST_UNINITIALIZED also select the bulk stub
        //   is_aligned       - when true only a one-word adjustment of s is
        //                      considered before the bulk copy
        //   s, d, count      - source, destination and element count registers
        //   step             - element size in bytes; negative means copy backwards
        //
        // Registers named as temps here: r3-r7, r11-r17, v0-v5, plus the GC
        // temps rscratch1, rscratch2, r10 and v6, v7, v16.
 1471
 1472   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1473                    Register s, Register d, Register count, int step) {
 1474     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1475     bool is_backwards = step < 0;
 1476     unsigned int granularity = g_uabs(step);
 1477     const Register t0 = r3, t1 = r4;
 1478
 1479     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
 1480     // load all the data before writing anything
 1481     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1482     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1483     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1484     const Register send = r17, dend = r16;
 1485     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1486     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1487     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1488
 1489     if (PrefetchCopyIntervalInBytes > 0)
 1490       __ prfm(Address(s, 0), PLDL1KEEP);
          // Byte thresholds are divided by granularity because count is in elements.
 1491     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1492     __ br(Assembler::HI, copy_big);
 1493
          // send/dend = s/d + count * granularity, i.e. one past the last
          // element.  Each inline case copies a head relative to s/d and a
          // (possibly overlapping) tail relative to send/dend.
 1494     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1495     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1496
 1497     __ cmp(count, u1(16/granularity));
 1498     __ br(Assembler::LS, copy16);
 1499
 1500     __ cmp(count, u1(64/granularity));
 1501     __ br(Assembler::HI, copy80);
 1502
 1503     __ cmp(count, u1(32/granularity));
 1504     __ br(Assembler::LS, copy32);
 1505
 1506     // 33..64 bytes
 1507     if (UseSIMDForMemoryOps) {
 1508       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1509       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1510       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1511       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1512     } else {
 1513       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1514       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1515       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1516       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1517
 1518       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1519       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1520       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1521       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1522     }
 1523     __ b(finish);
 1524
 1525     // 17..32 bytes
 1526     __ bind(copy32);
 1527     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1528     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1529
 1530     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1531     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1532     __ b(finish);
 1533
 1534     // 65..80/96 bytes
 1535     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1536     __ bind(copy80);
 1537     if (UseSIMDForMemoryOps) {
 1538       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1539       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1540       // Unaligned pointers can be an issue for copying.
 1541       // The issue has more chances to happen when granularity of data is
 1542       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
 1543       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1544       // The most performance drop has been seen for the range 65-80 bytes.
 1545       // For such cases using the pair of ldp/stp instead of the third pair of
 1546       // ldpq/stpq fixes the performance issue.
 1547       if (granularity < sizeof (jint)) {
 1548         Label copy96;
 1549         __ cmp(count, u1(80/granularity));
 1550         __ br(Assembler::HI, copy96);
 1551         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1552
 1553         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1554         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1555
 1556         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1557         __ b(finish);
 1558
 1559         __ bind(copy96);
 1560       }
 1561       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1562
 1563       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1564       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1565
 1566       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1567     } else {
 1568       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1569       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1570       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1571       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1572       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1573
 1574       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1575       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1576       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1577       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1578       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1579     }
 1580     __ b(finish);
 1581
 1582     // 0..16 bytes
 1583     __ bind(copy16);
 1584     __ cmp(count, u1(8/granularity));
 1585     __ br(Assembler::LO, copy8);
 1586
 1587     // 8..16 bytes
 1588     bs.copy_load_at_8(t0, Address(s, 0));
 1589     bs.copy_load_at_8(t1, Address(send, -8));
 1590     bs.copy_store_at_8(Address(d, 0), t0);
 1591     bs.copy_store_at_8(Address(dend, -8), t1);
 1592     __ b(finish);
 1593
          // The sub-8-byte and sub-4-byte cases only exist for element sizes
          // that can produce them.  When a case is not emitted here, its
          // label is bound next to finish at the end of this function.
 1594     if (granularity < 8) {
 1595       // 4..7 bytes
 1596       __ bind(copy8);
 1597       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1598       __ ldrw(t0, Address(s, 0));
 1599       __ ldrw(t1, Address(send, -4));
 1600       __ strw(t0, Address(d, 0));
 1601       __ strw(t1, Address(dend, -4));
 1602       __ b(finish);
 1603       if (granularity < 4) {
 1604         // 0..3 bytes
 1605         __ bind(copy4);
 1606         __ cbz(count, finish); // get rid of 0 case
 1607         if (granularity == 2) {
 1608           __ ldrh(t0, Address(s, 0));
 1609           __ strh(t0, Address(d, 0));
 1610         } else { // granularity == 1
 1611           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1612           // the first and last byte.
 1613           // Handle the 3 byte case by loading and storing base + count/2
 1614           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1615           // This does mean in the 1 byte case we load/store the same
 1616           // byte 3 times.
 1617           __ lsr(count, count, 1);
 1618           __ ldrb(t0, Address(s, 0));
 1619           __ ldrb(t1, Address(send, -1));
 1620           __ ldrb(t2, Address(s, count));
 1621           __ strb(t0, Address(d, 0));
 1622           __ strb(t1, Address(dend, -1));
 1623           __ strb(t2, Address(d, count));
 1624         }
 1625         __ b(finish);
 1626       }
 1627     }
 1628
 1629     __ bind(copy_big);
          // A backwards copy starts from the end: move s and d to one past
          // the last element.
 1630     if (is_backwards) {
 1631       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1632       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1633     }
 1634
 1635     // Now we've got the small case out of the way we can align the
 1636     // source address on a 2-word boundary.
 1637
 1638     // Here we will materialize a count in r15, which is used by copy_memory_small
 1639     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
 1640     // Up until here, we have used t9, which aliases r15, but from here on, that register
 1641     // can not be used as a temp register, as it contains the count.
 1642
 1643     Label aligned;
 1644
 1645     if (is_aligned) {
 1646       // We may have to adjust by 1 word to get s 2-word-aligned.
 1647       __ tbz(s, exact_log2(wordSize), aligned);
 1648       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1649       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1650       __ sub(count, count, wordSize/granularity);
 1651     } else {
            // Backwards the bytes to copy are the low bits of s itself;
            // forwards they are the low bits of -s (distance to the next
            // 2-word boundary).
 1652       if (is_backwards) {
 1653         __ andr(r15, s, 2 * wordSize - 1);
 1654       } else {
 1655         __ neg(r15, s);
 1656         __ andr(r15, r15, 2 * wordSize - 1);
 1657       }
 1658       // r15 is the byte adjustment needed to align s.
 1659       __ cbz(r15, aligned);
            // Convert the byte adjustment to elements before taking it off count.
 1660       int shift = exact_log2(granularity);
 1661       if (shift > 0) {
 1662         __ lsr(r15, r15, shift);
 1663       }
 1664       __ sub(count, count, r15);
 1665
 1666 #if 0
 1667       // ?? This code is only correct for a disjoint copy.  It may or
 1668       // may not make sense to use it in that case.
 1669
 1670       // Copy the first pair; s and d may not be aligned.
 1671       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1672       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1673
 1674       // Align s and d, adjust count
 1675       if (is_backwards) {
 1676         __ sub(s, s, r15);
 1677         __ sub(d, d, r15);
 1678       } else {
 1679         __ add(s, s, r15);
 1680         __ add(d, d, r15);
 1681       }
 1682 #else
 1683       copy_memory_small(decorators, type, s, d, r15, step);
 1684 #endif
 1685     }
 1686
 1687     __ bind(aligned);
 1688
 1689     // s is now 2-word-aligned.
 1690
 1691     // We have a count of units and some trailing bytes. Adjust the
 1692     // count and do a bulk copy of words. If the shift is zero
 1693     // perform a move instead to benefit from zero latency moves.
 1694     int shift = exact_log2(wordSize/granularity);
 1695     if (shift > 0) {
 1696       __ lsr(r15, count, shift);
 1697     } else {
 1698       __ mov(r15, count);
 1699     }
          // Select the bulk stub by direction, then by whether this is an oop
          // copy and, for oops, whether the destination is uninitialized.
 1700     if (direction == copy_forwards) {
 1701       if (type != T_OBJECT) {
 1702         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
 1703         __ blr(rscratch1);
 1704       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1705         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
 1706         __ blr(rscratch1);
 1707       } else {
 1708         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
 1709         __ blr(rscratch1);
 1710       }
 1711     } else {
 1712       if (type != T_OBJECT) {
 1713         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
 1714         __ blr(rscratch1);
 1715       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1716         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
 1717         __ blr(rscratch1);
 1718       } else {
 1719         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
 1720         __ blr(rscratch1);
 1721       }
 1722     }
 1723
 1724     // And the tail.
 1725     copy_memory_small(decorators, type, s, d, count, step);
 1726
          // copy8 and copy4 were bound in the inline section only for
          // granularity < 8 and < 4 respectively.  Otherwise bind them here:
          // a branch to copy8 with 8-byte elements, or to copy4 with 4-byte
          // elements, is only taken when count is zero, so there is nothing
          // left to copy.
 1727     if (granularity >= 8) __ bind(copy8);
 1728     if (granularity >= 4) __ bind(copy4);
 1729     __ bind(finish);
 1730   }
 1731 
 1732 
 1733   void clobber_registers() {
          // In ASSERT builds, emit code that overwrites every call-clobbered
          // general purpose register other than rscratch1 with the pattern
          // 0xdeadbeefdeadbeef.  rscratch1 is used to build and hold the
          // pattern.  Without ASSERT nothing is emitted.
 1734 #ifdef ASSERT
 1735     RegSet clobbered
 1736       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
          // Build the 64-bit pattern: low half first, then OR in a copy of
          // it shifted left by 32.
 1737     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1738     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1739     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1740       __ mov(*it, rscratch1);
 1741     }
 1742 #endif
 1743
 1744   }
 1745 
 1746   // Scan over array at a for count oops, verifying each one.
 1747   // Preserves a and count, clobbers rscratch1 and rscratch2.
        //
        //   size  - element size in bytes; wordSize selects full-width oops,
        //           any other value is treated as a 32-bit narrow oop
        //   a     - register holding the array base address
        //   count - register holding the number of elements
        //   temp  - register that receives each loaded element (clobbered)
 1748   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1749     Label loop, end;
          // NOTE(review): rscratch1 receives a copy of a but is not read
          // again in this function; the loads below index from a directly.
 1750     __ mov(rscratch1, a);
          // rscratch2 is the element index, counting up from zero.
 1751     __ mov(rscratch2, zr);
 1752     __ bind(loop);
 1753     __ cmp(rscratch2, count);
          // Unsigned compare: leave the loop once index >= count.
 1754     __ br(Assembler::HS, end);
 1755     if (size == wordSize) {
 1756       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1757       __ verify_oop(temp);
 1758     } else {
 1759       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1760       __ decode_heap_oop(temp); // calls verify_oop
 1761     }
 1762     __ add(rscratch2, rscratch2, 1);
 1763     __ b(loop);
 1764     __ bind(end);
 1765   }
 1766 
 1767   // Arguments:
 1768   //   stub_id - is used to name the stub and identify all details of
 1769   //             how to perform the copy.
 1770   //
 1771   //   nopush_entry - is assigned to the stub's post push entry point;
 1772   //                  must not be null (asserted below)
 1773   //
 1774   // Inputs:
 1775   //   c_rarg0   - source array address
 1776   //   c_rarg1   - destination array address
 1777   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1778   //
 1779   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1780   // the hardware handle it.  The two dwords within qwords that span
 1781   // cache line boundaries will still be loaded and stored atomically.
 1782   //
 1783   // Side Effects: nopush_entry is set to the (post push) entry point
 1784   //               so it can be used by the corresponding conjoint
 1785   //               copy method
 1786   //
        // Returns the start address of the stub.  The generated code always
        // returns 0 in r0.
        //
 1787   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1788     int size;
 1789     bool aligned;
 1790     bool is_oop;
 1791     bool dest_uninitialized;
          // Decode stub_id into element size, alignment, whether the elements
          // are oops, and whether the destination is uninitialized.
 1792     switch (stub_id) {
 1793     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1794       size = sizeof(jbyte);
 1795       aligned = false;
 1796       is_oop = false;
 1797       dest_uninitialized = false;
 1798       break;
 1799     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1800       size = sizeof(jbyte);
 1801       aligned = true;
 1802       is_oop = false;
 1803       dest_uninitialized = false;
 1804       break;
 1805     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1806       size = sizeof(jshort);
 1807       aligned = false;
 1808       is_oop = false;
 1809       dest_uninitialized = false;
 1810       break;
 1811     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1812       size = sizeof(jshort);
 1813       aligned = true;
 1814       is_oop = false;
 1815       dest_uninitialized = false;
 1816       break;
 1817     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1818       size = sizeof(jint);
 1819       aligned = false;
 1820       is_oop = false;
 1821       dest_uninitialized = false;
 1822       break;
 1823     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1824       size = sizeof(jint);
 1825       aligned = true;
 1826       is_oop = false;
 1827       dest_uninitialized = false;
 1828       break;
 1829     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1830       // since this is always aligned we can (should!) use the same
 1831       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1832       ShouldNotReachHere();
 1833       break;
 1834     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1835       size = sizeof(jlong);
 1836       aligned = true;
 1837       is_oop = false;
 1838       dest_uninitialized = false;
 1839       break;
          // For all four oop variants the element size and alignment depend
          // only on UseCompressedOops; the arrayof_ and plain ids are treated
          // the same.
 1840     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1841       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1842       aligned = !UseCompressedOops;
 1843       is_oop = true;
 1844       dest_uninitialized = false;
 1845       break;
 1846     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1847       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1848       aligned = !UseCompressedOops;
 1849       is_oop = true;
 1850       dest_uninitialized = false;
 1851       break;
 1852     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1853       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1854       aligned = !UseCompressedOops;
 1855       is_oop = true;
 1856       dest_uninitialized = true;
 1857       break;
 1858     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1859       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1860       aligned = !UseCompressedOops;
 1861       is_oop = true;
 1862       dest_uninitialized = true;
 1863       break;
 1864     default:
 1865       ShouldNotReachHere();
 1866       break;
 1867     }
 1868     // all stubs provide a 2nd entry which omits the frame push for
 1869     // use when bailing out from a conjoint copy. However we may also
 1870     // need some extra addresses for memory access protection.
 1871     int entry_count = StubInfo::entry_count(stub_id);
 1872     assert(entry_count == 2, "sanity check");
 1873     assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");
 1874
          // Unsafe access handler addresses are only recorded for non-oop
          // stubs that are either unaligned or copy jlongs.
 1875     bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
 1876     int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
 1877     GrowableArray<address> entries;
 1878     GrowableArray<address> extras;
 1879     GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
          // A non-null result from load_archive_data is used as the stub
          // start address and code generation is skipped entirely.
 1880     address start = load_archive_data(stub_id, &entries, extras_ptr);
 1881     if (start != nullptr) {
 1882       assert(entries.length() == entry_count - 1,
 1883              "unexpected entries count %d", entries.length());
 1884       *nopush_entry = entries.at(0);
 1885       assert(extras.length() == extra_count,
 1886              "unexpected extra count %d", extras.length());
 1887       if (add_extras) {
 1888         // register one handler at offset 0
 1889         register_unsafe_access_handlers(extras, 0, 1);
 1890       }
 1891       return start;
 1892     }
 1893
 1894     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1895     RegSet saved_reg = RegSet::of(s, d, count);
 1896
 1897     __ align(CodeEntryAlignment);
 1898     StubCodeMark mark(this, stub_id);
 1899     start = __ pc();
 1900     __ enter();
 1901
          // The nopush entry sits immediately after the frame push.
 1902     *nopush_entry = __ pc();
 1903     entries.append(*nopush_entry);
 1904
 1905     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1906     BLOCK_COMMENT("Post-Push Entry:");
 1907
 1908     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1909     if (dest_uninitialized) {
 1910       decorators |= IS_DEST_UNINITIALIZED;
 1911     }
 1912     if (aligned) {
 1913       decorators |= ARRAYCOPY_ALIGNED;
 1914     }
 1915
 1916     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1917     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1918
 1919     if (is_oop) {
 1920       // save regs before copy_memory
 1921       __ push(RegSet::of(d, count), sp);
 1922     }
 1923     {
 1924       // UnsafeMemoryAccess page error: continue after unsafe access
 1925       UnsafeMemoryAccessMark umam(this, add_extras, true);
            // A positive step requests a forwards copy; oop copies are typed
            // T_OBJECT, everything else T_BYTE.
 1926       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1927     }
 1928
 1929     if (is_oop) {
            // Restore d and count, which the epilogue (and VerifyOops) use.
 1930       __ pop(RegSet::of(d, count), sp);
 1931       if (VerifyOops)
 1932         verify_oop_array(size, d, count, r16);
 1933     }
 1934
 1935     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1936
 1937     __ leave();
 1938     __ mov(r0, zr); // return 0
 1939     __ ret(lr);
 1940
 1941     address end = __ pc();
 1942
 1943     if (add_extras) {
 1944       // retrieve the registered handler addresses
 1945       retrieve_unsafe_access_handlers(start, end, extras);
 1946       assert(extras.length() == extra_count
 1947              , "incorrect handlers count %d", extras.length());
 1948     }
 1949
 1950     // record the stub entry and end plus the no_push entry and any
 1951     // extra handler addresses
 1952     store_archive_data(stub_id, start, end, &entries, extras_ptr);
 1953
 1954     return start;
 1955   }
 1956 
 1957   // Arguments:
 1958   //   stub_id - is used to name the stub and identify all details of
 1959   //             how to perform the copy.
 1960   //
 1961   //   nooverlap_target - identifies the (post push) entry for the
 1962   //             corresponding disjoint copy routine which can be
 1963   //             jumped to if the ranges do not actually overlap
 1964   //
 1965   //   nopush_entry - is assigned to the stub's post push entry point
 1966   //                  unless it is null
 1967   //
 1968   //
 1969   // Inputs:
 1970   //   c_rarg0   - source array address
 1971   //   c_rarg1   - destination array address
 1972   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1973   //
 1974   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1975   // the hardware handle it.  The two dwords within qwords that span
 1976   // cache line boundaries will still be loaded and stored atomically.
 1977   //
 1978   // Side Effects:
 1979   //   nopush_entry (when non-null) is set to this stub's post push
 1980   //   entry point so it can be used by some other conjoint copy method
 1981   //
        // Returns the start address of the stub.  The generated code always
        // returns 0 in r0.
        //
 1982   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1983     int size;
 1984     bool aligned;
 1985     bool is_oop;
 1986     bool dest_uninitialized;
          // Decode stub_id into element size, alignment, whether the elements
          // are oops, and whether the destination is uninitialized.
 1987     switch (stub_id) {
 1988     case StubId::stubgen_jbyte_arraycopy_id:
 1989       size = sizeof(jbyte);
 1990       aligned = false;
 1991       is_oop = false;
 1992       dest_uninitialized = false;
 1993       break;
 1994     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1995       size = sizeof(jbyte);
 1996       aligned = true;
 1997       is_oop = false;
 1998       dest_uninitialized = false;
 1999       break;
 2000     case StubId::stubgen_jshort_arraycopy_id:
 2001       size = sizeof(jshort);
 2002       aligned = false;
 2003       is_oop = false;
 2004       dest_uninitialized = false;
 2005       break;
 2006     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 2007       size = sizeof(jshort);
 2008       aligned = true;
 2009       is_oop = false;
 2010       dest_uninitialized = false;
 2011       break;
 2012     case StubId::stubgen_jint_arraycopy_id:
 2013       size = sizeof(jint);
 2014       aligned = false;
 2015       is_oop = false;
 2016       dest_uninitialized = false;
 2017       break;
 2018     case StubId::stubgen_arrayof_jint_arraycopy_id:
 2019       size = sizeof(jint);
 2020       aligned = true;
 2021       is_oop = false;
 2022       dest_uninitialized = false;
 2023       break;
 2024     case StubId::stubgen_jlong_arraycopy_id:
 2025       // since this is always aligned we can (should!) use the same
 2026       // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 2027       ShouldNotReachHere();
 2028       break;
 2029     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 2030       size = sizeof(jlong);
 2031       aligned = true;
 2032       is_oop = false;
 2033       dest_uninitialized = false;
 2034       break;
          // For all four oop variants the element size and alignment depend
          // only on UseCompressedOops; the arrayof_ and plain ids are treated
          // the same.
 2035     case StubId::stubgen_oop_arraycopy_id:
 2036       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2037       aligned = !UseCompressedOops;
 2038       is_oop = true;
 2039       dest_uninitialized = false;
 2040       break;
 2041     case StubId::stubgen_arrayof_oop_arraycopy_id:
 2042       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2043       aligned = !UseCompressedOops;
 2044       is_oop = true;
 2045       dest_uninitialized = false;
 2046       break;
 2047     case StubId::stubgen_oop_arraycopy_uninit_id:
 2048       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2049       aligned = !UseCompressedOops;
 2050       is_oop = true;
 2051       dest_uninitialized = true;
 2052       break;
 2053     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 2054       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2055       aligned = !UseCompressedOops;
 2056       is_oop = true;
 2057       dest_uninitialized = true;
 2058       break;
 2059     default:
 2060       ShouldNotReachHere();
 2061     }
 2062     // only some conjoint stubs generate a 2nd entry
 2063     int entry_count = StubInfo::entry_count(stub_id);
 2064     int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
 2065     assert(entry_count == expected_entry_count,
 2066            "expected entry count %d does not match declared entry count %d for stub %s",
 2067            expected_entry_count, entry_count, StubInfo::name(stub_id));
 2068
 2069     // We need to protect memory accesses in certain cases
 2070     bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
 2071     int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
 2072     GrowableArray<address> entries;
 2073     GrowableArray<address> extras;
          // Only pass the arrays that this stub actually populates.
 2074     GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
 2075     GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
          // A non-null result from load_archive_data is used as the stub
          // start address and code generation is skipped entirely.
 2076     address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
 2077     if (start != nullptr) {
 2078       assert(entries.length() == expected_entry_count - 1,
 2079              "unexpected entries count %d", entries.length());
 2080       assert(extras.length() == extra_count,
 2081              "unexpected extra count %d", extras.length());
 2082       if (nopush_entry != nullptr) {
 2083         *nopush_entry = entries.at(0);
 2084       }
 2085       if (add_extras) {
 2086         // register one handler at offset 0
 2087         register_unsafe_access_handlers(extras, 0, 1);
 2088       }
 2089       return start;
 2090     }
 2091
 2092     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2093     RegSet saved_regs = RegSet::of(s, d, count);
 2094     StubCodeMark mark(this, stub_id);
 2095     start = __ pc();
 2096     __ enter();
 2097
          // The nopush entry, when requested, sits immediately after the
          // frame push.
 2098     if (nopush_entry != nullptr) {
 2099       *nopush_entry = __ pc();
 2100       entries.append(*nopush_entry);
 2101       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 2102       BLOCK_COMMENT("Post-Push Entry:");
 2103     }
 2104
 2105     // use fwd copy when (d-s) above_equal (count*size)
          // The compare is unsigned: when d - s is below count * size the
          // code continues at L_overlapping, otherwise it branches to
          // nooverlap_target.
 2106     Label L_overlapping;
 2107     __ sub(rscratch1, d, s);
 2108     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 2109     __ br(Assembler::LO, L_overlapping);
 2110     __ b(RuntimeAddress(nooverlap_target));
 2111     __ bind(L_overlapping);
 2112
 2113     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 2114     if (dest_uninitialized) {
 2115       decorators |= IS_DEST_UNINITIALIZED;
 2116     }
 2117     if (aligned) {
 2118       decorators |= ARRAYCOPY_ALIGNED;
 2119     }
 2120
 2121     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 2122     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 2123
 2124     if (is_oop) {
 2125       // save regs before copy_memory
 2126       __ push(RegSet::of(d, count), sp);
 2127     }
 2128     {
 2129       // UnsafeMemoryAccess page error: continue after unsafe access
 2130       UnsafeMemoryAccessMark umam(this, add_extras, true);
            // A negative step requests a backwards copy.
 2131       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 2132     }
 2133     if (is_oop) {
            // Restore d and count, which the epilogue (and VerifyOops) use.
 2134       __ pop(RegSet::of(d, count), sp);
 2135       if (VerifyOops)
 2136         verify_oop_array(size, d, count, r16);
 2137     }
 2138     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 2139     __ leave();
 2140     __ mov(r0, zr); // return 0
 2141     __ ret(lr);
 2142
 2143     assert(entries.length() == expected_entry_count - 1,
 2144            "unexpected entries count %d", entries.length());
 2145
 2146     address end = __ pc();
 2147
 2148     if (add_extras) {
 2149       // retrieve the registered handler addresses
 2150       retrieve_unsafe_access_handlers(start, end, extras);
 2151       assert(extras.length() == extra_count,
 2152              "incorrect handlers count %d", extras.length());
 2153     }
 2154
 2155     // record the stub entry and end plus any no_push entry and/or
 2156     // extra handler addresses
 2157     store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
 2158
 2159     return start;
 2160   }
 2161 
 2162   // Helper for generating a dynamic type check.
 2163   // Smashes rscratch1, rscratch2.
        //
        // Emits the fast-path subtype check followed by the slow-path check.
        // Control transfers to L_success when either check succeeds and falls
        // through past the end of the emitted code on failure.
        //
        //   sub_klass          - register holding the klass being tested
        //   super_check_offset - register passed to the fast path as the check offset
        //   super_klass        - register holding the klass to test against
        //   temp1, temp2       - temps handed to the slow path
        //   result             - not referenced in this function
        //   L_success          - label branched to on a successful check
 2164   void generate_type_check(Register sub_klass,
 2165                            Register super_check_offset,
 2166                            Register super_klass,
 2167                            Register temp1,
 2168                            Register temp2,
 2169                            Register result,
 2170                            Label& L_success) {
 2171     assert_different_registers(sub_klass, super_check_offset, super_klass);
 2172
 2173     BLOCK_COMMENT("type_check:");
 2174
 2175     Label L_miss;
 2176
          // The fast path is given L_success and L_miss and a null third
          // label; the slow path is given only L_success, so its failure
          // case falls through to L_miss below.
 2177     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 2178                                      super_check_offset);
 2179     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 2180
 2181     // Fall through on failure!
 2182     __ BIND(L_miss);
 2183   }
 2184 
 2185   //
 2186   //  Generate checkcasting array copy stub
 2187   //
 2188   //  Input:
 2189   //    c_rarg0   - source array address
 2190   //    c_rarg1   - destination array address
 2191   //    c_rarg2   - element count, treated as ssize_t, can be zero
 2192   //    c_rarg3   - size_t ckoff (super_check_offset)
 2193   //    c_rarg4   - oop ckval (super_klass)
 2194   //
 2195   //  Output:
 2196   //    r0 ==  0  -  success
 2197   //    r0 == -1^K - failure, where K is partial transfer count
 2198   //
 2199   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 2200     bool dest_uninitialized;
 2201     switch (stub_id) {
 2202     case StubId::stubgen_checkcast_arraycopy_id:
 2203       dest_uninitialized = false;
 2204       break;
 2205     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 2206       dest_uninitialized = true;
 2207       break;
 2208     default:
 2209       ShouldNotReachHere();
 2210     }
 2211 
 2212     // The normal stub provides a 2nd entry which omits the frame push
 2213     // for use when bailing out from a disjoint copy.
 2214     // Only some conjoint stubs generate a 2nd entry
 2215     int entry_count = StubInfo::entry_count(stub_id);
 2216     int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
 2217     GrowableArray<address> entries;
 2218     GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
 2219     assert(entry_count == expected_entry_count,
 2220            "expected entry count %d does not match declared entry count %d for stub %s",
 2221            expected_entry_count, entry_count, StubInfo::name(stub_id));
 2222     address start = load_archive_data(stub_id, entries_ptr);
 2223     if (start != nullptr) {
 2224       assert(entries.length() + 1 == expected_entry_count,
 2225              "expected entry count %d does not match return entry count %d for stub %s",
 2226              expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
 2227       if (nopush_entry != nullptr) {
 2228         *nopush_entry = entries.at(0);
 2229       }
 2230       return start;
 2231     }
 2232 
 2233     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 2234 
 2235     // Input registers (after setup_arg_regs)
 2236     const Register from        = c_rarg0;   // source array address
 2237     const Register to          = c_rarg1;   // destination array address
 2238     const Register count       = c_rarg2;   // elementscount
 2239     const Register ckoff       = c_rarg3;   // super_check_offset
 2240     const Register ckval       = c_rarg4;   // super_klass
 2241 
 2242     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 2243 
 2244     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 2245     const Register copied_oop  = r22;       // actual oop copied
 2246     const Register count_save  = r21;       // orig elementscount
 2247     const Register start_to    = r20;       // destination array start address
 2248     const Register r19_klass   = r19;       // oop._klass
 2249 
 2250     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 2251     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 2252 
 2253     //---------------------------------------------------------------
 2254     // Assembler stub will be used for this call to arraycopy
 2255     // if the two arrays are subtypes of Object[] but the
 2256     // destination array type is not equal to or a supertype
 2257     // of the source type.  Each element must be separately
 2258     // checked.
 2259 
 2260     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 2261                                copied_oop, r19_klass, count_save);
 2262 
 2263     __ align(CodeEntryAlignment);
 2264     StubCodeMark mark(this, stub_id);
 2265     start = __ pc();
 2266 
 2267     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2268 
 2269 #ifdef ASSERT
 2270     // caller guarantees that the arrays really are different
 2271     // otherwise, we would have to make conjoint checks
 2272     { Label L;
 2273       __ b(L);                  // conjoint check not yet implemented
 2274       __ stop("checkcast_copy within a single array");
 2275       __ bind(L);
 2276     }
 2277 #endif //ASSERT
 2278 
 2279     // Caller of this entry point must set up the argument registers.
 2280     if (nopush_entry != nullptr) {
 2281       *nopush_entry = __ pc();
 2282       entries.append(*nopush_entry);
 2283       BLOCK_COMMENT("Entry:");
 2284     }
 2285 
 2286      // Empty array:  Nothing to do.
 2287     __ cbz(count, L_done);
 2288     __ push(RegSet::of(r19, r20, r21, r22), sp);
 2289 
 2290 #ifdef ASSERT
 2291     BLOCK_COMMENT("assert consistent ckoff/ckval");
 2292     // The ckoff and ckval must be mutually consistent,
 2293     // even though caller generates both.
 2294     { Label L;
 2295       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2296       __ ldrw(start_to, Address(ckval, sco_offset));
 2297       __ cmpw(ckoff, start_to);
 2298       __ br(Assembler::EQ, L);
 2299       __ stop("super_check_offset inconsistent");
 2300       __ bind(L);
 2301     }
 2302 #endif //ASSERT
 2303 
 2304     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 2305     bool is_oop = true;
 2306     int element_size = UseCompressedOops ? 4 : 8;
 2307     if (dest_uninitialized) {
 2308       decorators |= IS_DEST_UNINITIALIZED;
 2309     }
 2310 
 2311     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 2312     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 2313 
 2314     // save the original count
 2315     __ mov(count_save, count);
 2316 
 2317     // Copy from low to high addresses
 2318     __ mov(start_to, to);              // Save destination array start address
 2319     __ b(L_load_element);
 2320 
 2321     // ======== begin loop ========
 2322     // (Loop is rotated; its entry is L_load_element.)
 2323     // Loop control:
 2324     //   for (; count != 0; count--) {
 2325     //     copied_oop = load_heap_oop(from++);
 2326     //     ... generate_type_check ...;
 2327     //     store_heap_oop(to++, copied_oop);
 2328     //   }
 2329     __ align(OptoLoopAlignment);
 2330 
 2331     __ BIND(L_store_element);
 2332     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 2333                       __ post(to, element_size), copied_oop, noreg,
 2334                       gct1, gct2, gct3);
 2335     __ sub(count, count, 1);
 2336     __ cbz(count, L_do_card_marks);
 2337 
 2338     // ======== loop entry is here ========
 2339     __ BIND(L_load_element);
 2340     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 2341                      copied_oop, noreg, __ post(from, element_size),
 2342                      gct1);
 2343     __ cbz(copied_oop, L_store_element);
 2344 
 2345     __ load_klass(r19_klass, copied_oop);// query the object klass
 2346 
 2347     BLOCK_COMMENT("type_check:");
 2348     generate_type_check(/*sub_klass*/r19_klass,
 2349                         /*super_check_offset*/ckoff,
 2350                         /*super_klass*/ckval,
 2351                         /*r_array_base*/gct1,
 2352                         /*temp2*/gct2,
 2353                         /*result*/r10, L_store_element);
 2354 
 2355     // Fall through on failure!
 2356 
 2357     // ======== end loop ========
 2358 
 2359     // It was a real error; we must depend on the caller to finish the job.
 2360     // Register count = remaining oops, count_orig = total oops.
 2361     // Emit GC store barriers for the oops we have copied and report
 2362     // their number to the caller.
 2363 
 2364     __ subs(count, count_save, count);     // K = partially copied oop count
 2365     __ eon(count, count, zr);              // report (-1^K) to caller
 2366     __ br(Assembler::EQ, L_done_pop);
 2367 
 2368     __ BIND(L_do_card_marks);
 2369     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
 2370 
 2371     __ bind(L_done_pop);
 2372     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2373     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2374 
 2375     __ bind(L_done);
 2376     __ mov(r0, count);
 2377     __ leave();
 2378     __ ret(lr);
 2379 
 2380     // record the stub entry and end plus any no_push entry
 2381     store_archive_data(stub_id, start, __ pc() , entries_ptr);
 2382     return start;
 2383   }
 2384 
 2385   // Perform range checks on the proposed arraycopy.
 2386   // Kills temp, but nothing else.
 2387   // Also, clean the sign bits of src_pos and dst_pos.
 2388   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2389                               Register src_pos, // source position (c_rarg1)
 2390                               Register dst,     // destination array oo (c_rarg2)
 2391                               Register dst_pos, // destination position (c_rarg3)
 2392                               Register length,
 2393                               Register temp,
 2394                               Label& L_failed) {
 2395     BLOCK_COMMENT("arraycopy_range_checks:");
 2396 
 2397     assert_different_registers(rscratch1, temp);
 2398 
 2399     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2400     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2401     __ addw(temp, length, src_pos);
 2402     __ cmpw(temp, rscratch1);
 2403     __ br(Assembler::HI, L_failed);
 2404 
 2405     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2406     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2407     __ addw(temp, length, dst_pos);
 2408     __ cmpw(temp, rscratch1);
 2409     __ br(Assembler::HI, L_failed);
 2410 
 2411     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2412     __ movw(src_pos, src_pos);
 2413     __ movw(dst_pos, dst_pos);
 2414 
 2415     BLOCK_COMMENT("arraycopy_range_checks done");
 2416   }
 2417 
 2418   // These stubs get called from some dumb test routine.
 2419   // I'll write them properly when they're called from
 2420   // something that's actually doing something.
 2421   static void fake_arraycopy_stub(address src, address dst, int count) {
 2422     assert(count == 0, "huh?");
 2423   }
 2424 
 2425 
 2426   //
 2427   //  Generate 'unsafe' array copy stub
 2428   //  Though just as safe as the other stubs, it takes an unscaled
 2429   //  size_t argument instead of an element count.
 2430   //
 2431   //  Input:
 2432   //    c_rarg0   - source array address
 2433   //    c_rarg1   - destination array address
 2434   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2435   //
 2436   // Examines the alignment of the operands and dispatches
 2437   // to a long, int, short, or byte copy loop.
 2438   //
 2439   address generate_unsafe_copy(address byte_copy_entry,
 2440                                address short_copy_entry,
 2441                                address int_copy_entry,
 2442                                address long_copy_entry) {
 2443     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2444     int entry_count = StubInfo::entry_count(stub_id);
 2445     assert(entry_count == 1, "sanity check");
 2446     address start = load_archive_data(stub_id);
 2447     if (start != nullptr) {
 2448       return start;
 2449     }
 2450     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2451     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2452 
 2453     __ align(CodeEntryAlignment);
 2454     StubCodeMark mark(this, stub_id);
 2455     start = __ pc();
 2456     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2457 
 2458     // bump this on entry, not on exit:
 2459     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2460 
 2461     __ orr(rscratch1, s, d);
 2462     __ orr(rscratch1, rscratch1, count);
 2463 
 2464     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2465     __ cbz(rscratch1, L_long_aligned);
 2466     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2467     __ cbz(rscratch1, L_int_aligned);
 2468     __ tbz(rscratch1, 0, L_short_aligned);
 2469     __ b(RuntimeAddress(byte_copy_entry));
 2470 
 2471     __ BIND(L_short_aligned);
 2472     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2473     __ b(RuntimeAddress(short_copy_entry));
 2474     __ BIND(L_int_aligned);
 2475     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2476     __ b(RuntimeAddress(int_copy_entry));
 2477     __ BIND(L_long_aligned);
 2478     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2479     __ b(RuntimeAddress(long_copy_entry));
 2480 
 2481     // record the stub entry and end
 2482     store_archive_data(stub_id, start, __ pc());
 2483 
 2484     return start;
 2485   }
 2486 
 2487   //
 2488   //  Generate generic array copy stubs
 2489   //
 2490   //  Input:
 2491   //    c_rarg0    -  src oop
 2492   //    c_rarg1    -  src_pos (32-bits)
 2493   //    c_rarg2    -  dst oop
 2494   //    c_rarg3    -  dst_pos (32-bits)
 2495   //    c_rarg4    -  element count (32-bits)
 2496   //
 2497   //  Output:
 2498   //    r0 ==  0  -  success
 2499   //    r0 == -1^K - failure, where K is partial transfer count
 2500   //
 2501   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2502                                 address int_copy_entry, address oop_copy_entry,
 2503                                 address long_copy_entry, address checkcast_copy_entry) {
 2504     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2505     int entry_count = StubInfo::entry_count(stub_id);
 2506     assert(entry_count == 1, "sanity check");
 2507     address start = load_archive_data(stub_id);
 2508     if (start != nullptr) {
 2509       return start;
 2510     }
 2511     Label L_failed, L_objArray;
 2512     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2513 
 2514     // Input registers
 2515     const Register src        = c_rarg0;  // source array oop
 2516     const Register src_pos    = c_rarg1;  // source position
 2517     const Register dst        = c_rarg2;  // destination array oop
 2518     const Register dst_pos    = c_rarg3;  // destination position
 2519     const Register length     = c_rarg4;
 2520 
 2521 
 2522     // Registers used as temps
 2523     const Register dst_klass  = c_rarg5;
 2524 
 2525     __ align(CodeEntryAlignment);
 2526 
 2527     StubCodeMark mark(this, stub_id);
 2528 
 2529     start = __ pc();
 2530 
 2531     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2532 
 2533     // bump this on entry, not on exit:
 2534     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2535 
 2536     //-----------------------------------------------------------------------
 2537     // Assembler stub will be used for this call to arraycopy
 2538     // if the following conditions are met:
 2539     //
 2540     // (1) src and dst must not be null.
 2541     // (2) src_pos must not be negative.
 2542     // (3) dst_pos must not be negative.
 2543     // (4) length  must not be negative.
 2544     // (5) src klass and dst klass should be the same and not null.
 2545     // (6) src and dst should be arrays.
 2546     // (7) src_pos + length must not exceed length of src.
 2547     // (8) dst_pos + length must not exceed length of dst.
 2548     //
 2549 
 2550     //  if (src == nullptr) return -1;
 2551     __ cbz(src, L_failed);
 2552 
 2553     //  if (src_pos < 0) return -1;
 2554     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2555 
 2556     //  if (dst == nullptr) return -1;
 2557     __ cbz(dst, L_failed);
 2558 
 2559     //  if (dst_pos < 0) return -1;
 2560     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2561 
 2562     // registers used as temp
 2563     const Register scratch_length    = r16; // elements count to copy
 2564     const Register scratch_src_klass = r17; // array klass
 2565     const Register lh                = r15; // layout helper
 2566 
 2567     //  if (length < 0) return -1;
 2568     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2569     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2570 
 2571     __ load_klass(scratch_src_klass, src);
 2572 #ifdef ASSERT
 2573     //  assert(src->klass() != nullptr);
 2574     {
 2575       BLOCK_COMMENT("assert klasses not null {");
 2576       Label L1, L2;
 2577       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2578       __ bind(L1);
 2579       __ stop("broken null klass");
 2580       __ bind(L2);
 2581       __ load_klass(rscratch1, dst);
 2582       __ cbz(rscratch1, L1);     // this would be broken also
 2583       BLOCK_COMMENT("} assert klasses not null done");
 2584     }
 2585 #endif
 2586 
 2587     // Load layout helper (32-bits)
 2588     //
 2589     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2590     // 32        30    24            16              8     2                 0
 2591     //
 2592     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2593     //
 2594 
 2595     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2596 
 2597     // Handle objArrays completely differently...
 2598     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2599     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2600     __ movw(rscratch1, objArray_lh);
 2601     __ eorw(rscratch2, lh, rscratch1);
 2602     __ cbzw(rscratch2, L_objArray);
 2603 
 2604     //  if (src->klass() != dst->klass()) return -1;
 2605     __ load_klass(rscratch2, dst);
 2606     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2607     __ cbnz(rscratch2, L_failed);
 2608 
 2609     //  if (!src->is_Array()) return -1;
 2610     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2611 
 2612     // At this point, it is known to be a typeArray (array_tag 0x3).
 2613 #ifdef ASSERT
 2614     {
 2615       BLOCK_COMMENT("assert primitive array {");
 2616       Label L;
 2617       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2618       __ cmpw(lh, rscratch2);
 2619       __ br(Assembler::GE, L);
 2620       __ stop("must be a primitive array");
 2621       __ bind(L);
 2622       BLOCK_COMMENT("} assert primitive array done");
 2623     }
 2624 #endif
 2625 
 2626     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2627                            rscratch2, L_failed);
 2628 
 2629     // TypeArrayKlass
 2630     //
 2631     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2632     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2633     //
 2634 
 2635     const Register rscratch1_offset = rscratch1;    // array offset
 2636     const Register r15_elsize = lh; // element size
 2637 
 2638     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2639            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2640     __ add(src, src, rscratch1_offset);           // src array offset
 2641     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2642     BLOCK_COMMENT("choose copy loop based on element size");
 2643 
 2644     // next registers should be set before the jump to corresponding stub
 2645     const Register from     = c_rarg0;  // source array address
 2646     const Register to       = c_rarg1;  // destination array address
 2647     const Register count    = c_rarg2;  // elements count
 2648 
 2649     // 'from', 'to', 'count' registers should be set in such order
 2650     // since they are the same as 'src', 'src_pos', 'dst'.
 2651 
 2652     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2653 
 2654     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2655     // size in bytes).  We do a simple bitwise binary search.
 2656   __ BIND(L_copy_bytes);
 2657     __ tbnz(r15_elsize, 1, L_copy_ints);
 2658     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2659     __ lea(from, Address(src, src_pos));// src_addr
 2660     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2661     __ movw(count, scratch_length); // length
 2662     __ b(RuntimeAddress(byte_copy_entry));
 2663 
 2664   __ BIND(L_copy_shorts);
 2665     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2666     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2667     __ movw(count, scratch_length); // length
 2668     __ b(RuntimeAddress(short_copy_entry));
 2669 
 2670   __ BIND(L_copy_ints);
 2671     __ tbnz(r15_elsize, 0, L_copy_longs);
 2672     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2673     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2674     __ movw(count, scratch_length); // length
 2675     __ b(RuntimeAddress(int_copy_entry));
 2676 
 2677   __ BIND(L_copy_longs);
 2678 #ifdef ASSERT
 2679     {
 2680       BLOCK_COMMENT("assert long copy {");
 2681       Label L;
 2682       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2683       __ cmpw(r15_elsize, LogBytesPerLong);
 2684       __ br(Assembler::EQ, L);
 2685       __ stop("must be long copy, but elsize is wrong");
 2686       __ bind(L);
 2687       BLOCK_COMMENT("} assert long copy done");
 2688     }
 2689 #endif
 2690     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2691     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2692     __ movw(count, scratch_length); // length
 2693     __ b(RuntimeAddress(long_copy_entry));
 2694 
 2695     // ObjArrayKlass
 2696   __ BIND(L_objArray);
 2697     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2698 
 2699     Label L_plain_copy, L_checkcast_copy;
 2700     //  test array classes for subtyping
 2701     __ load_klass(r15, dst);
 2702     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2703     __ br(Assembler::NE, L_checkcast_copy);
 2704 
 2705     // Identically typed arrays can be copied without element-wise checks.
 2706     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2707                            rscratch2, L_failed);
 2708 
 2709     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2710     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2711     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2712     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2713     __ movw(count, scratch_length); // length
 2714   __ BIND(L_plain_copy);
 2715     __ b(RuntimeAddress(oop_copy_entry));
 2716 
 2717   __ BIND(L_checkcast_copy);
 2718     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2719     {
 2720       // Before looking at dst.length, make sure dst is also an objArray.
 2721       __ ldrw(rscratch1, Address(r15, lh_offset));
 2722       __ movw(rscratch2, objArray_lh);
 2723       __ eorw(rscratch1, rscratch1, rscratch2);
 2724       __ cbnzw(rscratch1, L_failed);
 2725 
 2726       // It is safe to examine both src.length and dst.length.
 2727       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2728                              r15, L_failed);
 2729 
 2730       __ load_klass(dst_klass, dst); // reload
 2731 
 2732       // Marshal the base address arguments now, freeing registers.
 2733       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2734       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2735       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2736       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2737       __ movw(count, length);           // length (reloaded)
 2738       Register sco_temp = c_rarg3;      // this register is free now
 2739       assert_different_registers(from, to, count, sco_temp,
 2740                                  dst_klass, scratch_src_klass);
 2741       // assert_clean_int(count, sco_temp);
 2742 
 2743       // Generate the type check.
 2744       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2745       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2746 
 2747       // Smashes rscratch1, rscratch2
 2748       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2749                           L_plain_copy);
 2750 
 2751       // Fetch destination element klass from the ObjArrayKlass header.
 2752       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2753       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2754       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2755 
 2756       // the checkcast_copy loop needs two extra arguments:
 2757       assert(c_rarg3 == sco_temp, "#3 already in place");
 2758       // Set up arguments for checkcast_copy_entry.
 2759       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2760       __ b(RuntimeAddress(checkcast_copy_entry));
 2761     }
 2762 
 2763   __ BIND(L_failed);
 2764     __ mov(r0, -1);
 2765     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2766     __ ret(lr);
 2767 
 2768     // record the stub entry and end
 2769     store_archive_data(stub_id, start, __ pc());
 2770 
 2771     return start;
 2772   }
 2773 
 2774   //
 2775   // Generate stub for array fill. If "aligned" is true, the
 2776   // "to" address is assumed to be heapword aligned.
 2777   //
 2778   // Arguments for generated stub:
 2779   //   to:    c_rarg0
 2780   //   value: c_rarg1
 2781   //   count: c_rarg2 treated as signed
 2782   //
 2783   address generate_fill(StubId stub_id) {
 2784     BasicType t;
 2785     bool aligned;
 2786 
 2787     switch (stub_id) {
 2788     case StubId::stubgen_jbyte_fill_id:
 2789       t = T_BYTE;
 2790       aligned = false;
 2791       break;
 2792     case StubId::stubgen_jshort_fill_id:
 2793       t = T_SHORT;
 2794       aligned = false;
 2795       break;
 2796     case StubId::stubgen_jint_fill_id:
 2797       t = T_INT;
 2798       aligned = false;
 2799       break;
 2800     case StubId::stubgen_arrayof_jbyte_fill_id:
 2801       t = T_BYTE;
 2802       aligned = true;
 2803       break;
 2804     case StubId::stubgen_arrayof_jshort_fill_id:
 2805       t = T_SHORT;
 2806       aligned = true;
 2807       break;
 2808     case StubId::stubgen_arrayof_jint_fill_id:
 2809       t = T_INT;
 2810       aligned = true;
 2811       break;
 2812     default:
 2813       ShouldNotReachHere();
 2814     };
 2815     int entry_count = StubInfo::entry_count(stub_id);
 2816     assert(entry_count == 1, "sanity check");
 2817     address start = load_archive_data(stub_id);
 2818     if (start != nullptr) {
 2819       return start;
 2820     }
 2821     __ align(CodeEntryAlignment);
 2822     StubCodeMark mark(this, stub_id);
 2823     start = __ pc();
 2824 
 2825     BLOCK_COMMENT("Entry:");
 2826 
 2827     const Register to        = c_rarg0;  // source array address
 2828     const Register value     = c_rarg1;  // value
 2829     const Register count     = c_rarg2;  // elements count
 2830 
 2831     const Register bz_base = r10;        // base for block_zero routine
 2832     const Register cnt_words = r11;      // temp register
 2833 
 2834     __ enter();
 2835 
 2836     Label L_fill_elements, L_exit1;
 2837 
 2838     int shift = -1;
 2839     switch (t) {
 2840       case T_BYTE:
 2841         shift = 0;
 2842         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2843         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2844         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2845         __ br(Assembler::LO, L_fill_elements);
 2846         break;
 2847       case T_SHORT:
 2848         shift = 1;
 2849         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2850         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2851         __ br(Assembler::LO, L_fill_elements);
 2852         break;
 2853       case T_INT:
 2854         shift = 2;
 2855         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2856         __ br(Assembler::LO, L_fill_elements);
 2857         break;
 2858       default: ShouldNotReachHere();
 2859     }
 2860 
 2861     // Align source address at 8 bytes address boundary.
 2862     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2863     if (!aligned) {
 2864       switch (t) {
 2865         case T_BYTE:
 2866           // One byte misalignment happens only for byte arrays.
 2867           __ tbz(to, 0, L_skip_align1);
 2868           __ strb(value, Address(__ post(to, 1)));
 2869           __ subw(count, count, 1);
 2870           __ bind(L_skip_align1);
 2871           // Fallthrough
 2872         case T_SHORT:
 2873           // Two bytes misalignment happens only for byte and short (char) arrays.
 2874           __ tbz(to, 1, L_skip_align2);
 2875           __ strh(value, Address(__ post(to, 2)));
 2876           __ subw(count, count, 2 >> shift);
 2877           __ bind(L_skip_align2);
 2878           // Fallthrough
 2879         case T_INT:
 2880           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2881           __ tbz(to, 2, L_skip_align4);
 2882           __ strw(value, Address(__ post(to, 4)));
 2883           __ subw(count, count, 4 >> shift);
 2884           __ bind(L_skip_align4);
 2885           break;
 2886         default: ShouldNotReachHere();
 2887       }
 2888     }
 2889 
 2890     //
 2891     //  Fill large chunks
 2892     //
 2893     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2894     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2895     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2896     if (UseBlockZeroing) {
 2897       Label non_block_zeroing, rest;
 2898       // If the fill value is zero we can use the fast zero_words().
 2899       __ cbnz(value, non_block_zeroing);
 2900       __ mov(bz_base, to);
 2901       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2902       address tpc = __ zero_words(bz_base, cnt_words);
 2903       if (tpc == nullptr) {
 2904         fatal("CodeCache is full at generate_fill");
 2905       }
 2906       __ b(rest);
 2907       __ bind(non_block_zeroing);
 2908       __ fill_words(to, cnt_words, value);
 2909       __ bind(rest);
 2910     } else {
 2911       __ fill_words(to, cnt_words, value);
 2912     }
 2913 
 2914     // Remaining count is less than 8 bytes. Fill it by a single store.
 2915     // Note that the total length is no less than 8 bytes.
 2916     if (t == T_BYTE || t == T_SHORT) {
 2917       Label L_exit1;
 2918       __ cbzw(count, L_exit1);
 2919       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2920       __ str(value, Address(to, -8));    // overwrite some elements
 2921       __ bind(L_exit1);
 2922       __ leave();
 2923       __ ret(lr);
 2924     }
 2925 
 2926     // Handle copies less than 8 bytes.
 2927     Label L_fill_2, L_fill_4, L_exit2;
 2928     __ bind(L_fill_elements);
 2929     switch (t) {
 2930       case T_BYTE:
 2931         __ tbz(count, 0, L_fill_2);
 2932         __ strb(value, Address(__ post(to, 1)));
 2933         __ bind(L_fill_2);
 2934         __ tbz(count, 1, L_fill_4);
 2935         __ strh(value, Address(__ post(to, 2)));
 2936         __ bind(L_fill_4);
 2937         __ tbz(count, 2, L_exit2);
 2938         __ strw(value, Address(to));
 2939         break;
 2940       case T_SHORT:
 2941         __ tbz(count, 0, L_fill_4);
 2942         __ strh(value, Address(__ post(to, 2)));
 2943         __ bind(L_fill_4);
 2944         __ tbz(count, 1, L_exit2);
 2945         __ strw(value, Address(to));
 2946         break;
 2947       case T_INT:
 2948         __ cbzw(count, L_exit2);
 2949         __ strw(value, Address(to));
 2950         break;
 2951       default: ShouldNotReachHere();
 2952     }
 2953     __ bind(L_exit2);
 2954     __ leave();
 2955     __ ret(lr);
 2956 
 2957     // record the stub entry and end
 2958     store_archive_data(stub_id, start, __ pc());
 2959 
 2960     return start;
 2961   }
 2962 
 2963   address generate_unsafecopy_common_error_exit() {
 2964     StubId stub_id = StubId::stubgen_unsafecopy_common_id;
 2965     int entry_count = StubInfo::entry_count(stub_id);
 2966     assert(entry_count == 1, "sanity check");
 2967     address start = load_archive_data(stub_id);
 2968     if (start != nullptr) {
 2969       return start;
 2970     }
 2971     __ align(CodeEntryAlignment);
 2972     StubCodeMark mark(this, stub_id);
 2973     start = __ pc();
 2974       __ leave();
 2975       __ mov(r0, 0);
 2976       __ ret(lr);
 2977 
 2978     // record the stub entry and end
 2979     store_archive_data(stub_id, start, __ pc());
 2980 
 2981     return start;
 2982   }
 2983 
  //
  //  Generate 'unsafe' set memory stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t (# bytes) argument instead of an element count.
  //
  //  This fill operation is atomicity preserving: as long as the
  //  address supplied is sufficiently aligned, all writes of up to 64
  //  bits in size are single-copy atomic.
  //
  //  Input:
  //    c_rarg0   - destination array address
  //    c_rarg1   - byte count (size_t)
  //    c_rarg2   - byte value
  //
  //  Returns the stub entry address. When load_archive_data() supplies
  //  an entry, the archived handler addresses are re-registered and no
  //  code is emitted.
  //
  address generate_unsafe_setmemory() {
    StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // we expect one set of extra unsafememory access handler entries
    GrowableArray<address> extras;
    int extra_count =  1 * UnsafeMemoryAccess::COLUMN_COUNT;
    address start = load_archive_data(stub_id, nullptr, &extras);
    if (start != nullptr) {
      assert(extras.length() == extra_count,
             "unexpected extra entry count %d", extras.length());
      register_unsafe_access_handlers(extras, 0, 1);
      return start;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
    Label tail;

    // The mark lives in its own scope so that it is destroyed before
    // the handler addresses are retrieved below.
    {
    UnsafeMemoryAccessMark umam(this, true, false);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Replicate the fill byte into all 16 byte lanes of v0.
    __ dup(v0, __ T16B, value);

    if (AvoidUnalignedAccesses) {
      // Fewer than 16 bytes: go straight to the tail, which only uses
      // stores of 8 bytes or less in that case.
      __ cmp(count, (u1)16);
      __ br(__ LO, tail);

      // Write one (possibly unaligned) 16-byte chunk at dest, then step
      // dest forward to the next 16-byte boundary. If dest is already
      // aligned the step is a full 16 bytes.
      __ mov(rscratch1, 16);
      __ andr(rscratch2, dest, 15);
      __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
      __ strq(v0, Address(dest));
      __ sub(count, count, rscratch1);
      __ add(dest, dest, rscratch1);
    }

    // Bias count by -64; a borrow (LO) means fewer than 64 bytes remain.
    __ subs(count, count, (u1)64);
    __ br(__ LO, tail);
    {
      // Main loop: 64 bytes per iteration as two 32-byte pair stores.
      Label again;
      __ bind(again);
      __ stpq(v0, v0, Address(dest));
      __ stpq(v0, v0, Address(dest, 32));

      __ subs(count, count, 64);
      __ add(dest, dest, 64);
      __ br(__ HS, again);
    }

    __ bind(tail);
    // The count of bytes is off by 64, but we don't need to correct
    // it because we're only going to use the least-significant few
    // count bits from here on.
    // (Subtracting 64 leaves bits 0..5 of count unchanged.)
    // __ add(count, count, 64);

    // Each set bit of the remaining count selects one store of the
    // matching size, largest first.
    {
      Label dont;
      __ tbz(count, exact_log2(32), dont);
      __ stpq(v0, v0, __ post(dest, 32));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(16), dont);
      __ strq(v0, __ post(dest, 16));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(8), dont);
      __ strd(v0, __ post(dest, 8));
      __ bind(dont);
    }

    // Skip the sub-8-byte stores when the low three count bits are clear.
    Label finished;
    __ tst(count, 7);
    __ br(__ EQ, finished);

    {
      Label dont;
      __ tbz(count, exact_log2(4), dont);
      __ strs(v0, __ post(dest, 4));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(2), dont);
      // Copy the low byte of value into bits 8..15 so the halfword
      // store writes the fill byte twice.
      __ bfi(value, value, 8, 8);
      __ strh(value, __ post(dest, 2));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(1), dont);
      __ strb(value, Address(dest));
      __ bind(dont);
    }

    __ bind(finished);
    __ leave();
    __ ret(lr);
    // have to exit the block and destroy the UnsafeMemoryAccessMark
    // in order to retrieve the handler end address
    }

    // install saved handler addresses in extras
    address end = __ pc();
    retrieve_unsafe_access_handlers(start, end, extras);
    assert(extras.length() == extra_count,
           "incorrect handlers count %d", extras.length());
    // record the stub entry and end plus the extras
    store_archive_data(stub_id, start, end, nullptr, &extras);

    return start;
  }
 3118 
 3119   address generate_data_cache_writeback() {
 3120     const Register line        = c_rarg0;  // address of line to write back
 3121 
 3122     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 3123     int entry_count = StubInfo::entry_count(stub_id);
 3124     assert(entry_count == 1, "sanity check");
 3125     address start = load_archive_data(stub_id);
 3126     if (start != nullptr) {
 3127       return start;
 3128     }
 3129     __ align(CodeEntryAlignment);
 3130     StubCodeMark mark(this, stub_id);
 3131 
 3132     start = __ pc();
 3133     __ enter();
 3134     __ cache_wb(Address(line, 0));
 3135     __ leave();
 3136     __ ret(lr);
 3137 
 3138     // record the stub entry and end
 3139     store_archive_data(stub_id, start, __ pc());
 3140 
 3141     return start;
 3142   }
 3143 
 3144   address generate_data_cache_writeback_sync() {
 3145     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 3146     int entry_count = StubInfo::entry_count(stub_id);
 3147     assert(entry_count == 1, "sanity check");
 3148     address start = load_archive_data(stub_id);
 3149     if (start != nullptr) {
 3150       return start;
 3151     }
 3152     const Register is_pre     = c_rarg0;  // pre or post sync
 3153     __ align(CodeEntryAlignment);
 3154     StubCodeMark mark(this, stub_id);
 3155 
 3156     // pre wbsync is a no-op
 3157     // post wbsync translates to an sfence
 3158 
 3159     Label skip;
 3160     start = __ pc();
 3161     __ enter();
 3162     __ cbnz(is_pre, skip);
 3163     __ cache_wbsync(false);
 3164     __ bind(skip);
 3165     __ leave();
 3166     __ ret(lr);
 3167 
 3168     // record the stub entry and end
 3169     store_archive_data(stub_id, start, __ pc());
 3170 
 3171     return start;
 3172   }
 3173 
  // Generate every array copy and fill stub and publish each entry
  // address in the matching StubRoutines field. The order matters:
  // a stub that uses another stub's entry as a branch target or
  // fallback is generated after the stub it depends on.
  void generate_arraycopy_stubs() {
    // Some copy stubs publish a normal entry and then a 2nd 'fallback'
    // entry immediately following their stack push. This can be used
    // as a post-push branch target for compatible stubs when they
    // identify a special case that can be handled by the fallback
    // stub e.g. a disjoint copy stub may be used as a special case
    // fallback for its compatible conjoint copy stub.
    //
    // A no push entry is always returned in the following local and
    // then published by assigning to the appropriate entry field in
    // class StubRoutines. The entry value is then passed to the
    // generator for the compatible stub. That means the entry must be
    // listed when saving to/restoring from the AOT cache, ensuring
    // that the inter-stub jumps are noted at AOT-cache save and
    // relocated at AOT cache load.
    address nopush_entry;

    // generate the common exit first so later stubs can rely on it if
    // they want an UnsafeMemoryAccess exit non-local to the stub
    StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
    // register the stub as the default exit with class UnsafeMemoryAccess
    UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);

    // generate and publish aarch64-specific bulk copy routines first
    // so we can call them from other copy stubs
    StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
    StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);

    StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
    StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);

    StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
    StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);

    StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is used by generic/unsafe copy
    StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // jint_arraycopy_nopush always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_jint_arraycopy_nopush = nopush_entry;

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
    // disjoint normal/nopush and conjoint normal entries are not
    // generated since the arrayof versions are the same
    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
      // disjoint arrayof nopush entry is needed by conjoint copy
      StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
      // conjoint arrayof nopush entry is needed by generic copy (the
      // unsafe copy below is only passed the primitive nopush entries)
      StubRoutines::_oop_arraycopy_nopush = nopush_entry;
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
      // disjoint arrayof+uninit nopush entry is needed by conjoint copy
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
      // note that we don't need a returned nopush entry because the
      // generic/unsafe copy does not cater for uninit arrays.
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
    }

    // for oop copies reuse arrayof entries for non-arrayof cases
    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
    // checkcast nopush entry is needed by generic copy
    StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic copy does not cater for uninit arrays.
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);

    // unsafe arraycopy may fallback on conjoint stubs
    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                              StubRoutines::_jshort_arraycopy_nopush,
                                                              StubRoutines::_jint_arraycopy_nopush,
                                                              StubRoutines::_jlong_arraycopy_nopush);

    // generic arraycopy may fallback on conjoint stubs
    StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                               StubRoutines::_jshort_arraycopy_nopush,
                                                               StubRoutines::_jint_arraycopy_nopush,
                                                               StubRoutines::_oop_arraycopy_nopush,
                                                               StubRoutines::_jlong_arraycopy_nopush,
                                                               StubRoutines::_checkcast_arraycopy_nopush);

    // fill stubs: one per element size, in plain and arrayof variants
    StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
    StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
    StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
    StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
    StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
    StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
  }
 3322 
 3323   void generate_math_stubs() { Unimplemented(); }
 3324 
 3325   // Arguments:
 3326   //
 3327   // Inputs:
 3328   //   c_rarg0   - source byte array address
 3329   //   c_rarg1   - destination byte array address
 3330   //   c_rarg2   - sessionKe (key) in little endian int array
 3331   //
 3332   address generate_aescrypt_encryptBlock() {
 3333     assert(UseAES, "need AES cryptographic extension support");
 3334     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 3335     int entry_count = StubInfo::entry_count(stub_id);
 3336     assert(entry_count == 1, "sanity check");
 3337     address start = load_archive_data(stub_id);
 3338     if (start != nullptr) {
 3339       return start;
 3340     }
 3341     __ align(CodeEntryAlignment);
 3342     StubCodeMark mark(this, stub_id);
 3343 
 3344     const Register from        = c_rarg0;  // source array address
 3345     const Register to          = c_rarg1;  // destination array address
 3346     const Register key         = c_rarg2;  // key array address
 3347     const Register keylen      = rscratch1;
 3348 
 3349     start = __ pc();
 3350     __ enter();
 3351 
 3352     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3353 
 3354     __ aesenc_loadkeys(key, keylen);
 3355     __ aesecb_encrypt(from, to, keylen);
 3356 
 3357     __ mov(r0, 0);
 3358 
 3359     __ leave();
 3360     __ ret(lr);
 3361 
 3362     // record the stub entry and end
 3363     store_archive_data(stub_id, start, __ pc());
 3364 
 3365     return start;
 3366   }
 3367 
 3368   // Arguments:
 3369   //
 3370   // Inputs:
 3371   //   c_rarg0   - source byte array address
 3372   //   c_rarg1   - destination byte array address
 3373   //   c_rarg2   - sessionKd (key) in little endian int array
 3374   //
 3375   address generate_aescrypt_decryptBlock() {
 3376     assert(UseAES, "need AES cryptographic extension support");
 3377     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 3378     int entry_count = StubInfo::entry_count(stub_id);
 3379     assert(entry_count == 1, "sanity check");
 3380     address start = load_archive_data(stub_id);
 3381     if (start != nullptr) {
 3382       return start;
 3383     }
 3384     __ align(CodeEntryAlignment);
 3385     StubCodeMark mark(this, stub_id);
 3386     Label L_doLast;
 3387 
 3388     const Register from        = c_rarg0;  // source array address
 3389     const Register to          = c_rarg1;  // destination array address
 3390     const Register key         = c_rarg2;  // key array address
 3391     const Register keylen      = rscratch1;
 3392 
 3393     start = __ pc();
 3394     __ enter(); // required for proper stackwalking of RuntimeStub frame
 3395 
 3396     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3397 
 3398     __ aesecb_decrypt(from, to, key, keylen);
 3399 
 3400     __ mov(r0, 0);
 3401 
 3402     __ leave();
 3403     __ ret(lr);
 3404 
 3405     // record the stub entry and end
 3406     store_archive_data(stub_id, start, __ pc());
 3407 
 3408     return start;
 3409   }
 3410 
  // Generate the AES CBC encryption stub.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - sessionKe (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  // Returns the stub entry address; when load_archive_data() supplies
  // an entry, that address is returned and no code is emitted.
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    start = __ pc();

      __ enter();

      // Keep the original length; it is the return value.
      __ movw(rscratch2, len_reg);

      // The array length field sits at (length offset - base offset)
      // relative to the address held in key.
      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

      // v0 carries the chaining value, starting with the contents of rvec.
      __ ld1(v0, __ T16B, rvec);

      // Select how many round keys to load. The flags set here are
      // tested again at the top of L_aes_loop, so nothing emitted in
      // between may alter the condition flags.
      __ cmpw(keylen, 52);
      __ br(Assembler::CC, L_loadkeys_44);
      __ br(Assembler::EQ, L_loadkeys_52);

      // keylen > 52: two extra leading round keys (15 in total, v17..v31)
      __ ld1(v17, v18, __ T16B, __ post(key, 32));
      __ rev32(v17, __ T16B, v17);
      __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
      // keylen >= 52: 13 round keys in v19..v31
      __ ld1(v19, v20, __ T16B, __ post(key, 32));
      __ rev32(v19, __ T16B, v19);
      __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
      // all key lengths: the final 11 round keys in v21..v31
      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
      __ rev32(v21, __ T16B, v21);
      __ rev32(v22, __ T16B, v22);
      __ rev32(v23, __ T16B, v23);
      __ rev32(v24, __ T16B, v24);
      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
      __ rev32(v25, __ T16B, v25);
      __ rev32(v26, __ T16B, v26);
      __ rev32(v27, __ T16B, v27);
      __ rev32(v28, __ T16B, v28);
      __ ld1(v29, v30, v31, __ T16B, key);
      __ rev32(v29, __ T16B, v29);
      __ rev32(v30, __ T16B, v30);
      __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
      // XOR the next input block into the chaining value.
      __ ld1(v1, __ T16B, __ post(from, 16));
      __ eor(v0, __ T16B, v0, v1);

      // Reuses the flags from cmpw(keylen, 52) above to skip the
      // rounds whose keys were not loaded.
      __ br(Assembler::CC, L_rounds_44);
      __ br(Assembler::EQ, L_rounds_52);

      __ aese(v0, v17); __ aesmc(v0, v0);
      __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
      __ aese(v0, v19); __ aesmc(v0, v0);
      __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
      __ aese(v0, v21); __ aesmc(v0, v0);
      __ aese(v0, v22); __ aesmc(v0, v0);
      __ aese(v0, v23); __ aesmc(v0, v0);
      __ aese(v0, v24); __ aesmc(v0, v0);
      __ aese(v0, v25); __ aesmc(v0, v0);
      __ aese(v0, v26); __ aesmc(v0, v0);
      __ aese(v0, v27); __ aesmc(v0, v0);
      __ aese(v0, v28); __ aesmc(v0, v0);
      __ aese(v0, v29); __ aesmc(v0, v0);
      // last round: no aesmc, then XOR with the final round key
      __ aese(v0, v30);
      __ eor(v0, __ T16B, v0, v31);

      // v0 is both the output block and the next chaining value.
      __ st1(v0, __ T16B, __ post(to, 16));

      __ subw(len_reg, len_reg, 16);
      __ cbnzw(len_reg, L_aes_loop);

      // Save the last output block back to rvec.
      __ st1(v0, __ T16B, rvec);

      __ mov(r0, rscratch2);

      __ leave();
      __ ret(lr);

      // record the stub entry and end
      store_archive_data(stub_id, start, __ pc());

      return start;
  }
 3524 
  // Generate the AES CBC decryption stub.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - sessionKd (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  // Returns the stub entry address; when load_archive_data() supplies
  // an entry, that address is returned and no code is emitted.
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left holding the last input block processed
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    start = __ pc();

      __ enter();

      // Keep the original length; it is the return value.
      __ movw(rscratch2, len_reg);

      // The array length field sits at (length offset - base offset)
      // relative to the address held in key.
      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

      // v2 carries the chaining value, starting with the contents of rvec.
      __ ld1(v2, __ T16B, rvec);

      // The first 16 bytes of the key array go to v31, which is XORed
      // in after the last aesd below.
      __ ld1(v31, __ T16B, __ post(key, 16));
      __ rev32(v31, __ T16B, v31);

      // Select how many round keys to load. The flags set here are
      // tested again at the top of L_aes_loop, so nothing emitted in
      // between may alter the condition flags.
      __ cmpw(keylen, 52);
      __ br(Assembler::CC, L_loadkeys_44);
      __ br(Assembler::EQ, L_loadkeys_52);

      // keylen > 52: two extra round keys in v17, v18
      __ ld1(v17, v18, __ T16B, __ post(key, 32));
      __ rev32(v17, __ T16B, v17);
      __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
      // keylen >= 52: two more round keys in v19, v20
      __ ld1(v19, v20, __ T16B, __ post(key, 32));
      __ rev32(v19, __ T16B, v19);
      __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
      // all key lengths: ten round keys in v21..v30
      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
      __ rev32(v21, __ T16B, v21);
      __ rev32(v22, __ T16B, v22);
      __ rev32(v23, __ T16B, v23);
      __ rev32(v24, __ T16B, v24);
      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
      __ rev32(v25, __ T16B, v25);
      __ rev32(v26, __ T16B, v26);
      __ rev32(v27, __ T16B, v27);
      __ rev32(v28, __ T16B, v28);
      __ ld1(v29, v30, __ T16B, key);
      __ rev32(v29, __ T16B, v29);
      __ rev32(v30, __ T16B, v30);

    __ BIND(L_aes_loop);
      // Load the next input block and keep an untouched copy in v1
      // (orr of a register with itself is a move).
      __ ld1(v0, __ T16B, __ post(from, 16));
      __ orr(v1, __ T16B, v0, v0);

      // Reuses the flags from cmpw(keylen, 52) above to skip the
      // rounds whose keys were not loaded.
      __ br(Assembler::CC, L_rounds_44);
      __ br(Assembler::EQ, L_rounds_52);

      __ aesd(v0, v17); __ aesimc(v0, v0);
      __ aesd(v0, v18); __ aesimc(v0, v0);
    __ BIND(L_rounds_52);
      __ aesd(v0, v19); __ aesimc(v0, v0);
      __ aesd(v0, v20); __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
      __ aesd(v0, v21); __ aesimc(v0, v0);
      __ aesd(v0, v22); __ aesimc(v0, v0);
      __ aesd(v0, v23); __ aesimc(v0, v0);
      __ aesd(v0, v24); __ aesimc(v0, v0);
      __ aesd(v0, v25); __ aesimc(v0, v0);
      __ aesd(v0, v26); __ aesimc(v0, v0);
      __ aesd(v0, v27); __ aesimc(v0, v0);
      __ aesd(v0, v28); __ aesimc(v0, v0);
      __ aesd(v0, v29); __ aesimc(v0, v0);
      // last round: no aesimc, then XOR with the key held in v31
      __ aesd(v0, v30);
      __ eor(v0, __ T16B, v0, v31);
      // XOR with the chaining value to produce the output block
      __ eor(v0, __ T16B, v0, v2);

      __ st1(v0, __ T16B, __ post(to, 16));
      // The saved input block becomes the next chaining value.
      __ orr(v2, __ T16B, v1, v1);

      __ subw(len_reg, len_reg, 16);
      __ cbnzw(len_reg, L_aes_loop);

      // Save the final chaining value back to rvec.
      __ st1(v2, __ T16B, rvec);

      __ mov(r0, rscratch2);

      __ leave();
      __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 3642 
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: 128-bits. in is preserved.
  // The least-significant 64-bit word is in the upper dword of each vector.
  // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
  // Output: result
  // tmp is a scratch register that is overwritten.
  // NOTE(review): the register check below does not cover 'in', so 'in'
  // is only preserved when the caller passes a register other than
  // result or tmp.
  void be_add_128_64(FloatRegister result, FloatRegister in,
                     FloatRegister inc, FloatRegister tmp) {
    assert_different_registers(result, tmp, inc);

    __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
                                           // input
    __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing: a lane is
                                           // all ones when inc > result (unsigned)
    __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
                                           // MSD == 0 (must be!) to LSD
    __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
  }
 3659 
 3660   // CTR AES crypt.
 3661   // Arguments:
 3662   //
 3663   // Inputs:
 3664   //   c_rarg0   - source byte array address
 3665   //   c_rarg1   - destination byte array address
 3666   //   c_rarg2   - sessionKe (key) in little endian int array
 3667   //   c_rarg3   - counter vector byte array address
 3668   //   c_rarg4   - input length
 3669   //   c_rarg5   - saved encryptedCounter start
 3670   //   c_rarg6   - saved used length
 3671   //
 3672   // Output:
 3673   //   r0       - input length
 3674   //
 3675   address generate_counterMode_AESCrypt() {
          // Emits the AES/CTR stub and returns its entry address.
          // The generated code has a per-block/per-byte path and a bulk path
          // (CTR_large_block) that handles bulk_width blocks per iteration.
 3676     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3677     int entry_count = StubInfo::entry_count(stub_id);
 3678     assert(entry_count == 1, "sanity check");
          // If load_archive_data() yields an address for this stub, use it and
          // generate nothing.
 3679     address start = load_archive_data(stub_id);
 3680     if (start != nullptr) {
 3681       return start;
 3682     }
 3683     const Register in = c_rarg0;
 3684     const Register out = c_rarg1;
 3685     const Register key = c_rarg2;
 3686     const Register counter = c_rarg3;
          // saved_len keeps the incoming length for the return value; len is the
          // working copy that is counted down.
 3687     const Register saved_len = c_rarg4, len = r10;
 3688     const Register saved_encrypted_ctr = c_rarg5;
          // used_ptr is the address of the 32-bit "used" count; used holds its
          // value while the stub runs and is written back at DONE.
 3689     const Register used_ptr = c_rarg6, used = r12;
 3690
          // offset is the byte index into both in and out.
 3691     const Register offset = r7;
 3692     const Register keylen = r11;
 3693
 3694     const unsigned char block_size = 16;
 3695     const int bulk_width = 4;
 3696     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3697     // performance with larger data sizes, but it also means that the
 3698     // fast path isn't used until you have at least 8 blocks, and up
 3699     // to 127 bytes of data will be executed on the slow path. For
 3700     // that reason, and also so as not to blow away too much icache, 4
 3701     // blocks seems like a sensible compromise.
 3702
 3703     // Algorithm:
 3704     //
 3705     //    if (len == 0) {
 3706     //        goto DONE;
 3707     //    }
 3708     //    int result = len;
 3709     //    do {
 3710     //        if (used >= blockSize) {
 3711     //            if (len >= bulk_width * blockSize) {
 3712     //                CTR_large_block();
 3713     //                if (len == 0)
 3714     //                    goto DONE;
 3715     //            }
 3716     //            for (;;) {
 3717     //                16ByteVector v0 = counter;
 3718     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3719     //                used = 0;
 3720     //                if (len < blockSize)
 3721     //                    break;    /* goto NEXT */
 3722     //                16ByteVector v1 = load16Bytes(in, offset);
 3723     //                v1 = v1 ^ encryptedCounter;
 3724     //                store16Bytes(out, offset);
 3725     //                used = blockSize;
 3726     //                offset += blockSize;
 3727     //                len -= blockSize;
 3728     //                if (len == 0)
 3729     //                    goto DONE;
 3730     //            }
 3731     //        }
 3732     //      NEXT:
 3733     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3734     //        len--;
 3735     //    } while (len != 0);
 3736     //  DONE:
 3737     //    return result;
 3738     //
 3739     // CTR_large_block()
 3740     //    Wide bulk encryption of whole blocks.
 3741
 3742     __ align(CodeEntryAlignment);
 3743     StubCodeMark mark(this, stub_id);
 3744     start = __ pc();
 3745     __ enter();
 3746
 3747     Label DONE, CTR_large_block, large_block_return;
          // used is loaded before the zero-length check because DONE stores it
          // back through used_ptr unconditionally.
 3748     __ ldrw(used, Address(used_ptr));
 3749     __ cbzw(saved_len, DONE);
 3750
 3751     __ mov(len, saved_len);
 3752     __ mov(offset, 0);
 3753
 3754     // Compute #rounds for AES based on the length of the key array
 3755     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3756
 3757     __ aesenc_loadkeys(key, keylen);
 3758
 3759     {
 3760       Label L_CTR_loop, NEXT;
 3761
 3762       __ bind(L_CTR_loop);
 3763
            // Bytes of the saved encrypted counter are still unused: consume
            // them one at a time at NEXT.
 3764       __ cmp(used, block_size);
 3765       __ br(__ LO, NEXT);
 3766
 3767       // Maybe we have a lot of data
            // The branch consumes the flags of len - bulk_width * block_size;
            // the code visible here does not read the difference in rscratch1.
 3768       __ subsw(rscratch1, len, bulk_width * block_size);
 3769       __ br(__ HS, CTR_large_block);
 3770       __ BIND(large_block_return);
 3771       __ cbzw(len, DONE);
 3772
 3773       // Setup the counter
 3774       __ movi(v4, __ T4S, 0);
 3775       __ movi(v5, __ T4S, 1);
 3776       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3777
 3778       // 128-bit big-endian increment
            // rev64 converts between memory byte order and the dword layout
            // be_add_128_64 expects; the incremented counter is stored back
            // while v0 keeps the value that is about to be encrypted.
 3779       __ ld1(v0, __ T16B, counter);
 3780       __ rev64(v16, __ T16B, v0);
 3781       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3782       __ rev64(v16, __ T16B, v16);
 3783       __ st1(v16, __ T16B, counter);
 3784       // Previous counter value is in v0
 3785       // v4 contains { 0, 1 }
 3786
 3787       {
 3788         // We have fewer than bulk_width blocks of data left. Encrypt
 3789         // them one by one until there is less than a full block
 3790         // remaining, being careful to save both the encrypted counter
 3791         // and the counter.
 3792
 3793         Label inner_loop;
 3794         __ bind(inner_loop);
 3795         // Counter to encrypt is in v0
 3796         __ aesecb_encrypt(noreg, noreg, keylen);
 3797         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3798
 3799         // Do we have a remaining full block?
 3800
 3801         __ mov(used, 0);
 3802         __ cmp(len, block_size);
 3803         __ br(__ LO, NEXT);
 3804
 3805         // Yes, we have a full block
 3806         __ ldrq(v1, Address(in, offset));
 3807         __ eor(v1, __ T16B, v1, v0);
 3808         __ strq(v1, Address(out, offset));
 3809         __ mov(used, block_size);
 3810         __ add(offset, offset, block_size);
 3811
 3812         __ subw(len, len, block_size);
 3813         __ cbzw(len, DONE);
 3814
 3815         // Increment the counter, store it back
              // orr with identical sources copies v16 into v0: v16 is the
              // counter value that was stored last, i.e. the next one to encrypt.
 3816         __ orr(v0, __ T16B, v16, v16);
 3817         __ rev64(v16, __ T16B, v16);
 3818         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3819         __ rev64(v16, __ T16B, v16);
 3820         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3821
 3822         __ b(inner_loop);
 3823       }
 3824
 3825       __ BIND(NEXT);
 3826
 3827       // Encrypt a single byte, and loop.
 3828       // We expect this to be a rare event.
 3829       __ ldrb(rscratch1, Address(in, offset));
 3830       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3831       __ eor(rscratch1, rscratch1, rscratch2);
 3832       __ strb(rscratch1, Address(out, offset));
 3833       __ add(offset, offset, 1);
 3834       __ add(used, used, 1);
 3835       __ subw(len, len,1);
 3836       __ cbnzw(len, L_CTR_loop);
 3837     }
 3838
          // Common exit: write used back and return the original length in r0.
 3839     __ bind(DONE);
 3840     __ strw(used, Address(used_ptr));
 3841     __ mov(r0, saved_len);
 3842
 3843     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3844     __ ret(lr);
 3845
 3846     // Bulk encryption
          // Emitted after the ret above; reached only by the branch from the
          // main loop, and it branches back to large_block_return when done.
 3847
 3848     __ BIND (CTR_large_block);
 3849     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3850
          // Spill the vector registers this path overwrites (v8-v11, plus
          // v12-v15 when bulk_width == 8); they are reloaded before returning.
          // NOTE(review): presumably required because v8-v15 are callee-saved
          // in the AArch64 procedure call standard - confirm.
 3851     if (bulk_width == 8) {
 3852       __ sub(sp, sp, 4 * 16);
 3853       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3854     }
 3855     __ sub(sp, sp, 4 * 16);
 3856     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
          // in, out and len are modified below; pushing them lets the pop after
          // the loop bring back the original pointers and the unrounded len.
 3857     RegSet saved_regs = (RegSet::of(in, out, offset)
 3858                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3859     __ push(saved_regs, sp);
 3860     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3861     __ add(in, in, offset);
 3862     __ add(out, out, offset);
 3863
 3864     // Keys should already be loaded into the correct registers
 3865
 3866     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3867     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3868
 3869     // AES/CTR loop
 3870     {
 3871       Label L_CTR_loop;
 3872       __ BIND(L_CTR_loop);
 3873
 3874       // Setup the counters
            // Rebuilt on every iteration because v8/v9 are overwritten by the
            // input load further down.
 3875       __ movi(v8, __ T4S, 0);
 3876       __ movi(v9, __ T4S, 1);
 3877       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3878
            // v0 .. v(bulk_width-1) receive consecutive counter values in
            // memory byte order; v16 ends up one past the last of them.
 3879       for (int i = 0; i < bulk_width; i++) {
 3880         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3881         __ rev64(v0_ofs, __ T16B, v16);
 3882         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3883       }
 3884
 3885       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3886
 3887       // Encrypt the counters
 3888       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3889
 3890       if (bulk_width == 8) {
 3891         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3892       }
 3893
 3894       // XOR the encrypted counters with the inputs
 3895       for (int i = 0; i < bulk_width; i++) {
 3896         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3897         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3898         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3899       }
 3900
 3901       // Write the encrypted data
 3902       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3903       if (bulk_width == 8) {
 3904         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3905       }
 3906
 3907       __ subw(len, len, 16 * bulk_width);
 3908       __ cbnzw(len, L_CTR_loop);
 3909     }
 3910
 3911     // Save the counter back where it goes
 3912     __ rev64(v16, __ T16B, v16);
 3913     __ st1(v16, __ T16B, counter);
 3914
 3915     __ pop(saved_regs, sp);
 3916
 3917     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3918     if (bulk_width == 8) {
 3919       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3920     }
 3921
          // len is the pre-rounding value again after the pop. rscratch1 is the
          // number of bytes the bulk loop handled; remove it from len and
          // advance offset past it.
 3922     __ andr(rscratch1, len, -16 * bulk_width);
 3923     __ sub(len, len, rscratch1);
 3924     __ add(offset, offset, rscratch1);
          // used = 16 (== block_size) and is written through used_ptr right away.
 3925     __ mov(used, 16);
 3926     __ strw(used, Address(used_ptr));
 3927     __ b(large_block_return);
 3928
 3929     // record the stub entry and end
 3930     store_archive_data(stub_id, start, __ pc());
 3931
 3932     return start;
 3933   }
 3934 
 3935   // Vector AES Galois Counter Mode implementation. Parameters:
 3936   //
 3937   // in = c_rarg0
 3938   // len = c_rarg1
 3939   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3940   // out = c_rarg3
 3941   // key = c_rarg4
 3942   // state = c_rarg5 - GHASH.state
 3943   // subkeyHtbl = c_rarg6 - powers of H
 3944   // counter = c_rarg7 - 16 bytes of CTR
 3945   // return - number of processed bytes
 3946   address generate_galoisCounterMode_AESCrypt() {
          // Emits the AES/GCM stub and returns its entry address. The generated
          // code first runs AES/CTR over the input in groups of 8 blocks, then
          // runs GHASH over ct for the same number of blocks.
 3947     Label ghash_polynomial; // local data generated after code
 3948     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3949     int entry_count = StubInfo::entry_count(stub_id);
 3950     assert(entry_count == 1, "sanity check");
          // If load_archive_data() yields an address for this stub, use it and
          // generate nothing.
 3951     address start = load_archive_data(stub_id);
 3952     if (start != nullptr) {
 3953       return start;
 3954     }
 3955     __ align(CodeEntryAlignment);
 3956     StubCodeMark mark(this, stub_id);
 3957     start = __ pc();
 3958     __ enter();
 3959
 3960     const Register in = c_rarg0;
 3961     const Register len = c_rarg1;
 3962     const Register ct = c_rarg2;
 3963     const Register out = c_rarg3;
 3964     // counter (c_rarg7, declared below) is read at the start and updated with the incremented counter in the end
 3965
 3966     const Register key = c_rarg4;
 3967     const Register state = c_rarg5;
 3968
 3969     const Register subkeyHtbl = c_rarg6;
 3970
 3971     const Register counter = c_rarg7;
 3972
 3973     const Register keylen = r10;
 3974     // Save state before entering routine
          // v8-v15 are overwritten below and reloaded at DONE.
 3975     __ sub(sp, sp, 4 * 16);
 3976     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3977     __ sub(sp, sp, 4 * 16);
 3978     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3979
 3980     // __ andr(len, len, -512);
          // Only whole groups of 8 blocks are processed. The rounded length is
          // pushed (2 * wordSize stack bytes) and popped into r0 at DONE as the
          // return value.
 3981     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3982     __ str(len, __ pre(sp, -2 * wordSize));
 3983
          // Nothing to do: DONE pops the stored zero and returns it.
 3984     Label DONE;
 3985     __ cbz(len, DONE);
 3986
 3987     // Compute #rounds for AES based on the length of the key array
 3988     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3989
 3990     __ aesenc_loadkeys(key, keylen);
 3991     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3992     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3993
 3994     // AES/CTR loop
 3995     {
 3996       Label L_CTR_loop;
 3997       __ BIND(L_CTR_loop);
 3998
 3999       // Setup the counters
            // Rebuilt on every iteration because v8/v9 are overwritten by the
            // input load further down.
 4000       __ movi(v8, __ T4S, 0);
 4001       __ movi(v9, __ T4S, 1);
 4002       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 4003
            // Fill v0..v7 with eight consecutive counter values. The increment
            // is a 32-bit lane add (T4S) on lane 3 only, so it wraps within
            // that word and never carries into the other three.
 4004       assert(v0->encoding() < v8->encoding(), "");
 4005       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 4006         FloatRegister f = as_FloatRegister(i);
 4007         __ rev32(f, __ T16B, v16);
 4008         __ addv(v16, __ T4S, v16, v8);
 4009       }
 4010
 4011       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 4012
 4013       // Encrypt the counters
 4014       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 4015
 4016       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 4017
 4018       // XOR the encrypted counters with the inputs
 4019       for (int i = 0; i < 8; i++) {
 4020         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 4021         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 4022         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 4023       }
 4024       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 4025       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 4026
 4027       __ subw(len, len, 16 * 8);
 4028       __ cbnzw(len, L_CTR_loop);
 4029     }
 4030
          // Write the advanced counter back in memory byte order.
 4031     __ rev32(v16, __ T16B, v16);
 4032     __ st1(v16, __ T16B, counter);
 4033
          // Reload the rounded byte count (left on the stack for DONE) and
          // convert it to a block count.
 4034     __ ldr(len, Address(sp));
 4035     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 4036
 4037     // GHASH/CTR loop
 4038     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 4039                                 len, /*unrolls*/4);
 4040
          // Debug builds only: stop if len is not zero at this point.
          // NOTE(review): presumably ghash_processBlocks_wide counts len down
          // to zero - confirm against that helper.
 4041 #ifdef ASSERT
 4042     { Label L;
 4043       __ cmp(len, (unsigned char)0);
 4044       __ br(Assembler::EQ, L);
 4045       __ stop("stubGenerator: abort");
 4046       __ bind(L);
 4047   }
 4048 #endif
 4049
 4050   __ bind(DONE);
 4051     // Return the number of bytes processed
 4052     __ ldr(r0, __ post(sp, 2 * wordSize));
 4053
          // Reload in the reverse order of the saves at the top.
 4054     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 4055     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 4056
 4057     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4058     __ ret(lr);
 4059
 4060     // bind label and generate polynomial data
 4061     __ align(wordSize * 2);
 4062     __ bind(ghash_polynomial);
 4063     __ emit_int64(0x87);  // The low-order bits of the field
 4064                           // polynomial (i.e. p = z^7+z^2+z+1)
 4065                           // repeated in the low and high parts of a
 4066                           // 128-bit vector
 4067     __ emit_int64(0x87);
 4068
 4069     // record the stub entry and end
 4070     store_archive_data(stub_id, start, __ pc());
 4071
 4072     return start;
 4073   }
 4074 
 4075   class Cached64Bytes {
        // Keeps 64 bytes of memory in eight general-purpose registers and emits
        // code to pull individual 32-bit words back out of them.
 4076   private:
 4077     MacroAssembler *_masm;
 4078     Register _regs[8];
 4079
 4080   public:
          // rs supplies the eight registers, taken in the set's iteration order.
 4081     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 4082       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
 4083       int slot = 0;
 4084       for (auto it = rs.begin(); slot < 8; ++it) {
 4085         _regs[slot] = *it;
 4086         slot++;
 4087       }
 4088     }
 4089
          // Generate four ldp instructions that fill the eight registers from
          // base + 0, 16, 32 and 48.
 4090     void gen_loads(Register base) {
 4091       for (int pair = 0; pair < 4; pair++) {
 4092         __ ldp(_regs[2 * pair], _regs[2 * pair + 1], Address(base, 16 * pair));
 4093       }
 4094     }
 4095
 4096     // Generate code extracting the i-th unsigned 32-bit word of the cached 64 bytes into dest.
          // Word i lives in register i / 2: even i in bits 0..31, odd i in bits 32..63.
 4097     void extract_u32(Register dest, int i) {
 4098       __ ubfx(dest, _regs[i / 2], (i % 2) * 32, 32);
 4099     }
 4100   };
 4101 
 4102   // Utility routines for md5.
 4103   // Clobbers rscratch1, rscratch2, r10 and r11.
 4104   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4105               int k, int s, int t) {
          // Emits r1 = r2 + rotl32(r1 + F + X[k] + t, s), where
          // F = ((r3 ^ r4) & r2) ^ r4 and X[k] is word k of reg_cache.
          // Writes rscratch1, rscratch2, r10 and r11.
 4106     const Register f_val = r10;
 4107     const Register sum   = r11;
 4108
 4109     __ eorw(f_val, r3, r4);
 4110     __ movw(rscratch2, t);
 4111     __ andw(f_val, f_val, r2);
 4112     __ addw(sum, r1, rscratch2);              // r1 + t
 4113     reg_cache.extract_u32(rscratch1, k);
 4114     __ eorw(f_val, f_val, r4);                // F complete
 4115     __ addw(sum, sum, rscratch1);             // r1 + t + X[k]
 4116     __ addw(f_val, f_val, sum);
 4117     __ rorw(rscratch2, f_val, 32 - s);        // rotate right by 32 - s == rotate left by s
 4118     __ addw(r1, rscratch2, r2);
 4119   }
 4120 
 4121   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4122               int k, int s, int t) {
          // Emits r1 = r2 + rotl32(r1 + t + X[k] + (r3 & ~r4) + (r2 & r4), s).
          // The two masked terms never share a set bit, so adding them gives
          // the same value as OR-ing them.
          // Writes rscratch1, rscratch2, r10 and r11.
 4123     const Register and_term = r10;
 4124     const Register sum      = r11;
 4125
 4126     reg_cache.extract_u32(rscratch1, k);
 4127     __ movw(rscratch2, t);
 4128     __ addw(sum, r1, rscratch2);              // r1 + t
 4129     __ addw(sum, sum, rscratch1);             // r1 + t + X[k]
 4130     __ bicw(rscratch2, r3, r4);               // r3 & ~r4
 4131     __ andw(and_term, r2, r4);                // r2 & r4
 4132     __ addw(rscratch2, rscratch2, sum);
 4133     __ addw(rscratch2, rscratch2, and_term);
 4134     __ rorw(rscratch2, rscratch2, 32 - s);    // rotate right by 32 - s == rotate left by s
 4135     __ addw(r1, rscratch2, r2);
 4136   }
 4137 
 4138   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4139               int k, int s, int t) {
          // Emits r1 = r2 + rotl32(r1 + (r2 ^ r3 ^ r4) + X[k] + t, s).
          // Writes rscratch1, rscratch2, r10 and r11.
 4140     const Register h_val = r10;
 4141     const Register sum   = r11;
 4142
 4143     __ eorw(h_val, r3, r4);
 4144     __ movw(rscratch2, t);
 4145     __ addw(sum, r1, rscratch2);              // r1 + t
 4146     reg_cache.extract_u32(rscratch1, k);
 4147     __ eorw(h_val, h_val, r2);                // r2 ^ r3 ^ r4
 4148     __ addw(sum, sum, rscratch1);             // r1 + t + X[k]
 4149     __ addw(h_val, h_val, sum);
 4150     __ rorw(rscratch2, h_val, 32 - s);        // rotate right by 32 - s == rotate left by s
 4151     __ addw(r1, rscratch2, r2);
 4152   }
 4153 
 4154   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4155               int k, int s, int t) {
          // Emits r1 = r2 + rotl32(r1 + (r3 ^ (r2 | ~r4)) + X[k] + t, s).
          // Writes rscratch1, rscratch2, r10 and r11.
 4156     const Register tmp = r10;   // first holds t, later the boolean term
 4157     const Register sum = r11;
 4158
 4159     __ movw(tmp, t);
 4160     __ ornw(rscratch2, r2, r4);               // r2 | ~r4
 4161     __ addw(sum, r1, tmp);                    // r1 + t
 4162     reg_cache.extract_u32(rscratch1, k);
 4163     __ eorw(tmp, rscratch2, r3);              // r3 ^ (r2 | ~r4)
 4164     __ addw(sum, sum, rscratch1);             // r1 + t + X[k]
 4165     __ addw(tmp, tmp, sum);
 4166     __ rorw(rscratch2, tmp, 32 - s);          // rotate right by 32 - s == rotate left by s
 4167     __ addw(r1, rscratch2, r2);
 4168   }
 4169 
 4170   // Arguments:
 4171   //
 4172   // Inputs:
 4173   //   c_rarg0   - byte[]  source+offset
 4174   //   c_rarg1   - int[]   MD5.state
 4175   //   c_rarg2   - int     offset
 4176   //   c_rarg3   - int     limit
 4177   //
 4178   address generate_md5_implCompress(StubId stub_id) {
          // Emits the MD5 compression stub for stub_id and returns its entry
          // address. The multi-block variant loops over 64-byte blocks and
          // returns the updated offset in c_rarg0.
          // NOTE(review): multi_block is left unset on the default path;
          // presumably ShouldNotReachHere() does not return - confirm.
 4179     bool multi_block;
 4180     switch (stub_id) {
 4181     case StubId::stubgen_md5_implCompress_id:
 4182       multi_block = false;
 4183       break;
 4184     case StubId::stubgen_md5_implCompressMB_id:
 4185       multi_block = true;
 4186       break;
 4187     default:
 4188       ShouldNotReachHere();
 4189     }
 4190     int entry_count = StubInfo::entry_count(stub_id);
 4191     assert(entry_count == 1, "sanity check");
          // If load_archive_data() yields an address for this stub, use it and
          // generate nothing.
 4192     address start = load_archive_data(stub_id);
 4193     if (start != nullptr) {
 4194       return start;
 4195     }
 4196     __ align(CodeEntryAlignment);
 4197
 4198     StubCodeMark mark(this, stub_id);
 4199     start = __ pc();
 4200
 4201     Register buf       = c_rarg0;
 4202     Register state     = c_rarg1;
 4203     Register ofs       = c_rarg2;
 4204     Register limit     = c_rarg3;
          // a, b, c, d are the four 32-bit working variables.
 4205     Register a         = r4;
 4206     Register b         = r5;
 4207     Register c         = r6;
 4208     Register d         = r7;
          // NOTE(review): rscratch3 is not referenced again in this function;
          // the md5_XX helpers declare their own r10/r11 aliases.
 4209     Register rscratch3 = r10;
 4210     Register rscratch4 = r11;
 4211
          // state_regs hold the 16-byte state packed as two 64-bit values:
          // [0] = a | b << 32, [1] = c | d << 32.
 4212     Register state_regs[2] = { r12, r13 };
          // Six of the eight block-cache registers come from saved_regs, which
          // are pushed here and popped before returning; the other two are r14
          // and r15.
 4213     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 4214     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 4215
 4216     __ push(saved_regs, sp);
 4217
 4218     __ ldp(state_regs[0], state_regs[1], Address(state));
 4219     __ ubfx(a, state_regs[0],  0, 32);
 4220     __ ubfx(b, state_regs[0], 32, 32);
 4221     __ ubfx(c, state_regs[1],  0, 32);
 4222     __ ubfx(d, state_regs[1], 32, 32);
 4223
 4224     Label md5_loop;
 4225     __ BIND(md5_loop);
 4226
          // Load the 64-byte block at buf into the register cache.
 4227     reg_cache.gen_loads(buf);
 4228
          // Helper arguments after the four state registers:
          // message word index k, left-rotate amount s, additive constant t.
 4229     // Round 1
 4230     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 4231     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 4232     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 4233     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 4234     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 4235     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 4236     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 4237     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 4238     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 4239     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 4240     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 4241     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 4242     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 4243     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 4244     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 4245     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 4246
 4247     // Round 2
 4248     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 4249     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 4250     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 4251     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 4252     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 4253     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 4254     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 4255     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 4256     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 4257     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 4258     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 4259     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 4260     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 4261     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 4262     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 4263     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 4264
 4265     // Round 3
 4266     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 4267     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 4268     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 4269     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 4270     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 4271     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 4272     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 4273     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 4274     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 4275     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 4276     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 4277     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 4278     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 4279     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 4280     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 4281     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 4282
 4283     // Round 4
 4284     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 4285     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 4286     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 4287     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 4288     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 4289     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 4290     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 4291     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 4292     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 4293     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 4294     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 4295     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 4296     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 4297     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 4298     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 4299     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 4300
          // Add the state from before this block to the new a, b, c, d. The
          // 32-bit adds use the low halves of state_regs directly; the high
          // halves are extracted with ubfx first.
 4301     __ addw(a, state_regs[0], a);
 4302     __ ubfx(rscratch2, state_regs[0], 32, 32);
 4303     __ addw(b, rscratch2, b);
 4304     __ addw(c, state_regs[1], c);
 4305     __ ubfx(rscratch4, state_regs[1], 32, 32);
 4306     __ addw(d, rscratch4, d);
 4307
          // Repack the updated state into the two 64-bit state registers.
 4308     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 4309     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 4310
          // Multi-block: advance by one 64-byte block and repeat while
          // ofs <= limit (signed compare); the final ofs is the return value.
 4311     if (multi_block) {
 4312       __ add(buf, buf, 64);
 4313       __ add(ofs, ofs, 64);
 4314       __ cmp(ofs, limit);
 4315       __ br(Assembler::LE, md5_loop);
 4316       __ mov(c_rarg0, ofs); // return ofs
 4317     }
 4318
 4319     // write hash values back in the correct order
 4320     __ stp(state_regs[0], state_regs[1], Address(state));
 4321
 4322     __ pop(saved_regs, sp);
 4323
 4324     __ ret(lr);
 4325
 4326     // record the stub entry and end
 4327     store_archive_data(stub_id, start, __ pc());
 4328
 4329     return start;
 4330   }
 4331 
 4332   // Arguments:
 4333   //
 4334   // Inputs:
 4335   //   c_rarg0   - byte[]  source+offset
 4336   //   c_rarg1   - int[]   SHA.state
 4337   //   c_rarg2   - int     offset
 4338   //   c_rarg3   - int     limit
 4339   //
 4340   address generate_sha1_implCompress(StubId stub_id) {
          // Emits the SHA-1 compression stub for stub_id and returns its entry
          // address. The multi-block variant loops over 64-byte blocks and
          // returns the updated offset in c_rarg0.
          // NOTE(review): multi_block is left unset on the default path;
          // presumably ShouldNotReachHere() does not return - confirm.
 4341     bool multi_block;
 4342     switch (stub_id) {
 4343     case StubId::stubgen_sha1_implCompress_id:
 4344       multi_block = false;
 4345       break;
 4346     case StubId::stubgen_sha1_implCompressMB_id:
 4347       multi_block = true;
 4348       break;
 4349     default:
 4350       ShouldNotReachHere();
 4351     }
 4352     int entry_count = StubInfo::entry_count(stub_id);
 4353     assert(entry_count == 1, "sanity check");
          // If load_archive_data() yields an address for this stub, use it and
          // generate nothing.
 4354     address start = load_archive_data(stub_id);
 4355     if (start != nullptr) {
 4356       return start;
 4357     }
 4358     __ align(CodeEntryAlignment);
 4359
 4360     StubCodeMark mark(this, stub_id);
 4361     start = __ pc();
 4362
 4363     Register buf   = c_rarg0;
 4364     Register state = c_rarg1;
 4365     Register ofs   = c_rarg2;
 4366     Register limit = c_rarg3;
 4367
          // keys labels the four 32-bit constants emitted after the ret below.
 4368     Label keys;
 4369     Label sha1_loop;
 4370
 4371     // load the keys into v0..v3
 4372     __ adr(rscratch1, keys);
 4373     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 4374     // load 5 words state into v6, v7
 4375     __ ldrq(v6, Address(state, 0));
 4376     __ ldrs(v7, Address(state, 16));
 4377
 4378
 4379     __ BIND(sha1_loop);
 4380     // load 64 bytes of data into v16..v19
          // buf is post-incremented only in the multi-block variant.
 4381     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
          // Byte-swap every 32-bit word of the block.
 4382     __ rev32(v16, __ T16B, v16);
 4383     __ rev32(v17, __ T16B, v17);
 4384     __ rev32(v18, __ T16B, v18);
 4385     __ rev32(v19, __ T16B, v19);
 4386
 4387     // do the sha1
          // v4 = first data vector + first key, consumed by iteration 0.
          // v20 = working copy of the first four state words (orr with
          // identical sources is a register move).
 4388     __ addv(v4, __ T4S, v16, v0);
 4389     __ orr(v20, __ T16B, v6, v6);
 4390
          // d0..d3 name the four data vectors; the names rotate by one at the
          // end of every iteration.
 4391     FloatRegister d0 = v16;
 4392     FloatRegister d1 = v17;
 4393     FloatRegister d2 = v18;
 4394     FloatRegister d3 = v19;
 4395
          // 20 iterations, each emitting one sha1h plus one of sha1c / sha1p /
          // sha1m. The registers alternate with the parity of round: tmp1 is
          // written with data + key in this iteration and read as tmp4 in the
          // next one; likewise tmp2 (sha1h result) is read as tmp3 in the next
          // one. Iteration 0 takes tmp3 from v7, the fifth state word.
 4396     for (int round = 0; round < 20; round++) {
 4397       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 4398       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 4399       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 4400       FloatRegister tmp4 = (round & 1) ? v5 : v4;
            // Key for the sum prepared for the next iteration: v0 for rounds
            // 0-3, v1 for 4-8, v2 for 9-13, v3 afterwards.
 4401       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 4402
            // Schedule updates (sha1su0/sha1su1) are emitted only in the first
            // 16 iterations; the last iteration prepares no further sum.
 4403       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 4404       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 4405       __ sha1h(tmp2, __ T4S, v20);
            // sha1c for rounds 0-4, sha1p for 5-9 and 15-19, sha1m for 10-14.
 4406       if (round < 5)
 4407         __ sha1c(v20, __ T4S, tmp3, tmp4);
 4408       else if (round < 10 || round >= 15)
 4409         __ sha1p(v20, __ T4S, tmp3, tmp4);
 4410       else
 4411         __ sha1m(v20, __ T4S, tmp3, tmp4);
 4412       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 4413
            // Rotate the data register names (generation-time only; emits nothing).
 4414       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4415     }
 4416
          // Add the block result to the state: v21 is the sha1h output of the
          // last iteration, v20 the other four working words.
 4417     __ addv(v7, __ T2S, v7, v21);
 4418     __ addv(v6, __ T4S, v6, v20);
 4419
          // Multi-block: repeat while ofs <= limit (signed compare); the final
          // ofs is the return value.
 4420     if (multi_block) {
 4421       __ add(ofs, ofs, 64);
 4422       __ cmp(ofs, limit);
 4423       __ br(Assembler::LE, sha1_loop);
 4424       __ mov(c_rarg0, ofs); // return ofs
 4425     }
 4426
 4427     __ strq(v6, Address(state, 0));
 4428     __ strs(v7, Address(state, 16));
 4429
 4430     __ ret(lr);
 4431
          // Constant data placed after the code; loaded via adr/ld4r above.
 4432     __ bind(keys);
 4433     __ emit_int32(0x5a827999);
 4434     __ emit_int32(0x6ed9eba1);
 4435     __ emit_int32(0x8f1bbcdc);
 4436     __ emit_int32(0xca62c1d6);
 4437
 4438     // record the stub entry and end
 4439     store_archive_data(stub_id, start, __ pc());
 4440
 4441     return start;
 4442   }
 4443 
 4444 
 4445   // Arguments:
 4446   //
 4447   // Inputs:
 4448   //   c_rarg0   - byte[]  source+offset
 4449   //   c_rarg1   - int[]   SHA.state
 4450   //   c_rarg2   - int     offset
 4451   //   c_rarg3   - int     limit
 4452   //
 4453   address generate_sha256_implCompress(StubId stub_id) {
          // Emits the SHA-256 compression stub for stub_id and returns its entry
          // address. The multi-block variant loops over 64-byte blocks and
          // returns the updated offset in c_rarg0.
          // NOTE(review): multi_block is left unset on the default path;
          // presumably ShouldNotReachHere() does not return - confirm.
 4454     bool multi_block;
 4455     switch (stub_id) {
 4456     case StubId::stubgen_sha256_implCompress_id:
 4457       multi_block = false;
 4458       break;
 4459     case StubId::stubgen_sha256_implCompressMB_id:
 4460       multi_block = true;
 4461       break;
 4462     default:
 4463       ShouldNotReachHere();
 4464     }
 4465     int entry_count = StubInfo::entry_count(stub_id);
 4466     assert(entry_count == 1, "sanity check");
          // If load_archive_data() yields an address for this stub, use it and
          // generate nothing.
 4467     address start = load_archive_data(stub_id);
 4468     if (start != nullptr) {
 4469       return start;
 4470     }
 4471     __ align(CodeEntryAlignment);
 4472     StubCodeMark mark(this, stub_id);
 4473     start = __ pc();
 4474
 4475     Register buf   = c_rarg0;
 4476     Register state = c_rarg1;
 4477     Register ofs   = c_rarg2;
 4478     Register limit = c_rarg3;
 4479
          // NOTE(review): the label is called sha1_loop although this is the
          // SHA-256 stub; it is only a local name.
 4480     Label sha1_loop;
 4481
          // v8-v11 receive the message block below; their D-register halves are
          // saved in a 32-byte stack area and reloaded before returning.
 4482     __ stpd(v8, v9, __ pre(sp, -32));
 4483     __ stpd(v10, v11, Address(sp, 16));
 4484
 4485 // dga == v0
 4486 // dgb == v1
 4487 // dg0 == v2
 4488 // dg1 == v3
 4489 // dg2 == v4
 4490 // t0 == v6
 4491 // t1 == v7
 4492
 4493     // load 16 keys to v16..v31
          // Sixteen vectors of four 32-bit constants each, read from
          // _sha256_round_consts.
 4494     __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
 4495     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 4496     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 4497     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 4498     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 4499
 4500     // load 8 words (256 bits) state
 4501     __ ldpq(v0, v1, state);
 4502
 4503     __ BIND(sha1_loop);
 4504     // load 64 bytes of data into v8..v11
          // buf is post-incremented only in the multi-block variant.
 4505     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
          // Byte-swap every 32-bit word of the block.
 4506     __ rev32(v8, __ T16B, v8);
 4507     __ rev32(v9, __ T16B, v9);
 4508     __ rev32(v10, __ T16B, v10);
 4509     __ rev32(v11, __ T16B, v11);
 4510
          // v6 = first data vector + first constant vector, consumed by
          // iteration 0. v2/v3 = working copies of the state (orr with
          // identical sources is a register move).
 4511     __ addv(v6, __ T4S, v8, v16);
 4512     __ orr(v2, __ T16B, v0, v0);
 4513     __ orr(v3, __ T16B, v1, v1);
 4514
          // d0..d3 name the four data vectors; the names rotate by one at the
          // end of every iteration.
 4515     FloatRegister d0 = v8;
 4516     FloatRegister d1 = v9;
 4517     FloatRegister d2 = v10;
 4518     FloatRegister d3 = v11;
 4519
 4520
          // 16 iterations. tmp1 is written with data + constants in this
          // iteration and read as tmp2 in the next one (v6/v7 alternate).
          // NOTE(review): tmp3 and tmp4 are computed but not referenced in the
          // loop body.
 4521     for (int round = 0; round < 16; round++) {
 4522       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4523       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4524       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4525       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4526
            // Schedule updates (sha256su0/sha256su1) are emitted only in the
            // first 12 iterations.
 4527       if (round < 12) __ sha256su0(d0, __ T4S, d1);
            // v4 keeps the value of v2 from before sha256h overwrites it;
            // sha256h2 reads it.
 4528        __ orr(v4, __ T16B, v2, v2);
            // Constant vector for the next iteration: v17 for round 0 up to
            // v31 for round 14.
 4529       if (round < 15)
 4530         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4531       __ sha256h(v2, __ T4S, v3, tmp2);
 4532       __ sha256h2(v3, __ T4S, v4, tmp2);
 4533       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4534
            // Rotate the data register names (generation-time only; emits nothing).
 4535       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4536     }
 4537
          // Add the block result to the state.
 4538     __ addv(v0, __ T4S, v0, v2);
 4539     __ addv(v1, __ T4S, v1, v3);
 4540
          // Multi-block: repeat while ofs <= limit (signed compare); the final
          // ofs is the return value.
 4541     if (multi_block) {
 4542       __ add(ofs, ofs, 64);
 4543       __ cmp(ofs, limit);
 4544       __ br(Assembler::LE, sha1_loop);
 4545       __ mov(c_rarg0, ofs); // return ofs
 4546     }
 4547
          // Reload the registers saved at entry and release the stack area.
 4548     __ ldpd(v10, v11, Address(sp, 16));
 4549     __ ldpd(v8, v9, __ post(sp, 32));
 4550
 4551     __ stpq(v0, v1, state);
 4552
 4553     __ ret(lr);
 4554
 4555     // record the stub entry and end
 4556     store_archive_data(stub_id, start, __ pc());
 4557
 4558     return start;
 4559   }
 4560 
 4561   // Double rounds for sha512.
 4562   void sha512_dround(int dr,
 4563                      FloatRegister vi0, FloatRegister vi1,
 4564                      FloatRegister vi2, FloatRegister vi3,
 4565                      FloatRegister vi4, FloatRegister vrc0,
 4566                      FloatRegister vrc1, FloatRegister vin0,
 4567                      FloatRegister vin1, FloatRegister vin2,
 4568                      FloatRegister vin3, FloatRegister vin4) {
 4569       if (dr < 36) {
 4570         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4571       }
 4572       __ addv(v5, __ T2D, vrc0, vin0);
 4573       __ ext(v6, __ T16B, vi2, vi3, 8);
 4574       __ ext(v5, __ T16B, v5, v5, 8);
 4575       __ ext(v7, __ T16B, vi1, vi2, 8);
 4576       __ addv(vi3, __ T2D, vi3, v5);
 4577       if (dr < 32) {
 4578         __ ext(v5, __ T16B, vin3, vin4, 8);
 4579         __ sha512su0(vin0, __ T2D, vin1);
 4580       }
 4581       __ sha512h(vi3, __ T2D, v6, v7);
 4582       if (dr < 32) {
 4583         __ sha512su1(vin0, __ T2D, vin2, v5);
 4584       }
 4585       __ addv(vi4, __ T2D, vi1, vi3);
 4586       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4587   }
 4588 
 4589   // Arguments:
 4590   //
 4591   // Inputs:
 4592   //   c_rarg0   - byte[]  source+offset
 4593   //   c_rarg1   - long[]  SHA.state
 4594   //   c_rarg2   - int     offset
 4595   //   c_rarg3   - int     limit
 4596   //
 4597   address generate_sha512_implCompress(StubId stub_id) {
 4598     bool multi_block;
 4599     switch (stub_id) {
 4600     case StubId::stubgen_sha512_implCompress_id:
 4601       multi_block = false;
 4602       break;
 4603     case StubId::stubgen_sha512_implCompressMB_id:
 4604       multi_block = true;
 4605       break;
 4606     default:
 4607       ShouldNotReachHere();
 4608     }
 4609     int entry_count = StubInfo::entry_count(stub_id);
 4610     assert(entry_count == 1, "sanity check");
 4611     address start = load_archive_data(stub_id);
 4612     if (start != nullptr) {
 4613       return start;
 4614     }
 4615     __ align(CodeEntryAlignment);
 4616     StubCodeMark mark(this, stub_id);
 4617     start = __ pc();
 4618 
 4619     Register buf   = c_rarg0;
 4620     Register state = c_rarg1;
 4621     Register ofs   = c_rarg2;
 4622     Register limit = c_rarg3;
 4623 
 4624     __ stpd(v8, v9, __ pre(sp, -64));
 4625     __ stpd(v10, v11, Address(sp, 16));
 4626     __ stpd(v12, v13, Address(sp, 32));
 4627     __ stpd(v14, v15, Address(sp, 48));
 4628 
 4629     Label sha512_loop;
 4630 
 4631     // load state
 4632     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4633 
 4634     // load first 4 round constants
 4635     __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
 4636     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4637 
 4638     __ BIND(sha512_loop);
 4639     // load 128B of data into v12..v19
 4640     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4641     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4642     __ rev64(v12, __ T16B, v12);
 4643     __ rev64(v13, __ T16B, v13);
 4644     __ rev64(v14, __ T16B, v14);
 4645     __ rev64(v15, __ T16B, v15);
 4646     __ rev64(v16, __ T16B, v16);
 4647     __ rev64(v17, __ T16B, v17);
 4648     __ rev64(v18, __ T16B, v18);
 4649     __ rev64(v19, __ T16B, v19);
 4650 
 4651     __ mov(rscratch2, rscratch1);
 4652 
 4653     __ mov(v0, __ T16B, v8);
 4654     __ mov(v1, __ T16B, v9);
 4655     __ mov(v2, __ T16B, v10);
 4656     __ mov(v3, __ T16B, v11);
 4657 
 4658     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4659     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4660     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4661     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4662     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4663     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4664     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4665     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4666     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4667     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4668     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4669     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4670     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4671     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4672     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4673     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4674     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4675     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4676     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4677     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4678     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4679     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4680     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4681     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4682     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4683     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4684     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4685     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4686     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4687     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4688     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4689     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4690     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4691     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4692     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4693     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4694     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4695     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4696     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4697     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4698 
 4699     __ addv(v8, __ T2D, v8, v0);
 4700     __ addv(v9, __ T2D, v9, v1);
 4701     __ addv(v10, __ T2D, v10, v2);
 4702     __ addv(v11, __ T2D, v11, v3);
 4703 
 4704     if (multi_block) {
 4705       __ add(ofs, ofs, 128);
 4706       __ cmp(ofs, limit);
 4707       __ br(Assembler::LE, sha512_loop);
 4708       __ mov(c_rarg0, ofs); // return ofs
 4709     }
 4710 
 4711     __ st1(v8, v9, v10, v11, __ T2D, state);
 4712 
 4713     __ ldpd(v14, v15, Address(sp, 48));
 4714     __ ldpd(v12, v13, Address(sp, 32));
 4715     __ ldpd(v10, v11, Address(sp, 16));
 4716     __ ldpd(v8, v9, __ post(sp, 64));
 4717 
 4718     __ ret(lr);
 4719 
 4720     // record the stub entry and end
 4721     store_archive_data(stub_id, start, __ pc());
 4722 
 4723     return start;
 4724   }
 4725 
 4726   // Execute one round of keccak of two computations in parallel.
 4727   // One of the states should be loaded into the lower halves of
 4728   // the vector registers v0-v24, the other should be loaded into
 4729   // the upper halves of those registers. The ld1r instruction loads
 4730   // the round constant into both halves of register v31.
 4731   // Intermediate results c0...c4 and d0...d4 are computed
 4732   // in registers v25...v30.
 4733   // All vector instructions that are used operate on both register
 4734   // halves in parallel.
 4735   // If only a single computation is needed, one can only load the lower halves.
 4736   void keccak_round(Register rscratch1) {
        // Emits one Keccak round over the 25 lanes held in v0..v24.
        //
        // The parameter is the register that points at the round constant for
        // this round; the ld1r below reads 8 bytes through it and advances it
        // by 8. Note that the parameter name shadows the rscratch1 register
        // alias used elsewhere in this file, so inside this function it means
        // whichever register the caller passed.
        //
        // v25..v31 are used as temporaries. In the comments below aN is lane N
        // and aN' is a new lane value that is still parked in a temporary.
        // The instruction order matters because lanes are overwritten in place
        // as soon as their old value has been consumed.

        // Column parities c0..c4 (v25..v29).
 4737   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4738   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4739   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4740   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4741   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4742   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4743   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4744   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4745   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4746   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4747
        // d0..d4. d0 goes to v30; the others overwrite the register of a c
        // value that is read for the last time by the same instruction, and
        // c4 (v29) is still needed by the final rax1.
 4748   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4749   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4750   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4751   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4752   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4753
        // Xor each lane with its d value, rotate it and move it to its new
        // position. The rotate amount is passed as 64 - n, so each comment
        // describes it as a left rotate by n. Each destination is a lane
        // whose old value has already been read or a temporary; v31 holds
        // a20' until the last bcax of the first group below.
 4754   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4755   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4756   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4757   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4758   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4759   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4760   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4761   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4762   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4763   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4764   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4765   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4766   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4767   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4768   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4769   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4770   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4771   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4772   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4773   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4774   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4775   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4776   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4777   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4778   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4779
        // Combine each lane with the next two lanes of its group of five,
        // starting with lanes 20..24.
 4780   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
 4781   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4782   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4783   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4784   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4785
        // a20' has been read for the last time, so v31 can now take the round
        // constant. The load also advances the constant pointer by 8.
 4786   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4787
        // lanes 15..19
 4788   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4789   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15)
 4790   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4791   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4792   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4793
        // lanes 10..14
 4794   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4795   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4796   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4797   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4798   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4799
        // lanes 5..9
 4800   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4801   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4802   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4803   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7')
 4804   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7' & a8')
 4805
        // lanes 0..4
 4806   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4807   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4808   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4809   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3')
 4810   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3' & a4')
 4811
        // Finally mix the round constant into lane 0.
 4812   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4813   }
 4814 
 4815   // Arguments:
 4816   //
 4817   // Inputs:
 4818   //   c_rarg0   - byte[]  source+offset
 4819   //   c_rarg1   - byte[]  SHA.state
 4820   //   c_rarg2   - int     block_size
 4821   //   c_rarg3   - int     offset
 4822   //   c_rarg4   - int     limit
 4823   //
 4824   address generate_sha3_implCompress(StubId stub_id) {
 4825     bool multi_block;
 4826     switch (stub_id) {
 4827     case StubId::stubgen_sha3_implCompress_id:
 4828       multi_block = false;
 4829       break;
 4830     case StubId::stubgen_sha3_implCompressMB_id:
 4831       multi_block = true;
 4832       break;
 4833     default:
 4834       ShouldNotReachHere();
 4835     }
 4836     int entry_count = StubInfo::entry_count(stub_id);
 4837     assert(entry_count == 1, "sanity check");
 4838     address start = load_archive_data(stub_id);
 4839     if (start != nullptr) {
 4840       return start;
 4841     }
 4842     __ align(CodeEntryAlignment);
 4843     StubCodeMark mark(this, stub_id);
 4844     start = __ pc();
 4845 
 4846     Register buf           = c_rarg0;
 4847     Register state         = c_rarg1;
 4848     Register block_size    = c_rarg2;
 4849     Register ofs           = c_rarg3;
 4850     Register limit         = c_rarg4;
 4851 
 4852     Label sha3_loop, rounds24_loop;
 4853     Label sha3_512_or_sha3_384, shake128;
 4854 
 4855     __ stpd(v8, v9, __ pre(sp, -64));
 4856     __ stpd(v10, v11, Address(sp, 16));
 4857     __ stpd(v12, v13, Address(sp, 32));
 4858     __ stpd(v14, v15, Address(sp, 48));
 4859 
 4860     // load state
 4861     __ add(rscratch1, state, 32);
 4862     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4863     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4864     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4865     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4866     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4867     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4868     __ ld1(v24, __ T1D, rscratch1);
 4869 
 4870     __ BIND(sha3_loop);
 4871 
 4872     // 24 keccak rounds
 4873     __ movw(rscratch2, 24);
 4874 
 4875     // load round_constants base
 4876     __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
 4877 
 4878     // load input
 4879     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4880     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4881     __ eor(v0, __ T8B, v0, v25);
 4882     __ eor(v1, __ T8B, v1, v26);
 4883     __ eor(v2, __ T8B, v2, v27);
 4884     __ eor(v3, __ T8B, v3, v28);
 4885     __ eor(v4, __ T8B, v4, v29);
 4886     __ eor(v5, __ T8B, v5, v30);
 4887     __ eor(v6, __ T8B, v6, v31);
 4888 
 4889     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4890     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4891 
 4892     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4893     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4894     __ eor(v7, __ T8B, v7, v25);
 4895     __ eor(v8, __ T8B, v8, v26);
 4896     __ eor(v9, __ T8B, v9, v27);
 4897     __ eor(v10, __ T8B, v10, v28);
 4898     __ eor(v11, __ T8B, v11, v29);
 4899     __ eor(v12, __ T8B, v12, v30);
 4900     __ eor(v13, __ T8B, v13, v31);
 4901 
 4902     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4903     __ eor(v14, __ T8B, v14, v25);
 4904     __ eor(v15, __ T8B, v15, v26);
 4905     __ eor(v16, __ T8B, v16, v27);
 4906 
 4907     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4908     __ andw(c_rarg5, block_size, 48);
 4909     __ cbzw(c_rarg5, rounds24_loop);
 4910 
 4911     __ tbnz(block_size, 5, shake128);
 4912     // block_size == 144, bit5 == 0, SHA3-224
 4913     __ ldrd(v28, __ post(buf, 8));
 4914     __ eor(v17, __ T8B, v17, v28);
 4915     __ b(rounds24_loop);
 4916 
 4917     __ BIND(shake128);
 4918     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4919     __ eor(v17, __ T8B, v17, v28);
 4920     __ eor(v18, __ T8B, v18, v29);
 4921     __ eor(v19, __ T8B, v19, v30);
 4922     __ eor(v20, __ T8B, v20, v31);
 4923     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4924 
 4925     __ BIND(sha3_512_or_sha3_384);
 4926     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4927     __ eor(v7, __ T8B, v7, v25);
 4928     __ eor(v8, __ T8B, v8, v26);
 4929     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4930 
 4931     // SHA3-384
 4932     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4933     __ eor(v9,  __ T8B, v9,  v27);
 4934     __ eor(v10, __ T8B, v10, v28);
 4935     __ eor(v11, __ T8B, v11, v29);
 4936     __ eor(v12, __ T8B, v12, v30);
 4937 
 4938     __ BIND(rounds24_loop);
 4939     __ subw(rscratch2, rscratch2, 1);
 4940 
 4941     keccak_round(rscratch1);
 4942 
 4943     __ cbnzw(rscratch2, rounds24_loop);
 4944 
 4945     if (multi_block) {
 4946       __ add(ofs, ofs, block_size);
 4947       __ cmp(ofs, limit);
 4948       __ br(Assembler::LE, sha3_loop);
 4949       __ mov(c_rarg0, ofs); // return ofs
 4950     }
 4951 
 4952     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4953     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4954     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4955     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4956     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4957     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4958     __ st1(v24, __ T1D, state);
 4959 
 4960     // restore callee-saved registers
 4961     __ ldpd(v14, v15, Address(sp, 48));
 4962     __ ldpd(v12, v13, Address(sp, 32));
 4963     __ ldpd(v10, v11, Address(sp, 16));
 4964     __ ldpd(v8, v9, __ post(sp, 64));
 4965 
 4966     __ ret(lr);
 4967 
 4968     // record the stub entry and end
 4969     store_archive_data(stub_id, start, __ pc());
 4970 
 4971     return start;
 4972   }
 4973 
 4974   // Inputs:
 4975   //   c_rarg0   - long[]  state0
 4976   //   c_rarg1   - long[]  state1
 4977   address generate_double_keccak() {
 4978     StubId stub_id = StubId::stubgen_double_keccak_id;
 4979     int entry_count = StubInfo::entry_count(stub_id);
 4980     assert(entry_count == 1, "sanity check");
 4981     address start = load_archive_data(stub_id);
 4982     if (start != nullptr) {
 4983       return start;
 4984     }
 4985     // Implements the double_keccak() method of the
 4986     // sun.security.provider.SHA3Parallel class
 4987     __ align(CodeEntryAlignment);
 4988     StubCodeMark mark(this, stub_id);
 4989     start = __ pc();
 4990     __ enter();
 4991 
 4992     Register state0        = c_rarg0;
 4993     Register state1        = c_rarg1;
 4994 
 4995     Label rounds24_loop;
 4996 
 4997     // save callee-saved registers
 4998     __ stpd(v8, v9, __ pre(sp, -64));
 4999     __ stpd(v10, v11, Address(sp, 16));
 5000     __ stpd(v12, v13, Address(sp, 32));
 5001     __ stpd(v14, v15, Address(sp, 48));
 5002 
 5003     // load states
 5004     __ add(rscratch1, state0, 32);
 5005     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 5006     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 5007     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 5008     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 5009     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 5010     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 5011     __ ld1(v24, __ D, 0, rscratch1);
 5012     __ add(rscratch1, state1, 32);
 5013     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 5014     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 5015     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 5016     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 5017     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 5018     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 5019     __ ld1(v24, __ D, 1, rscratch1);
 5020 
 5021     // 24 keccak rounds
 5022     __ movw(rscratch2, 24);
 5023 
 5024     // load round_constants base
 5025     __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
 5026 
 5027     __ BIND(rounds24_loop);
 5028     __ subw(rscratch2, rscratch2, 1);
 5029     keccak_round(rscratch1);
 5030     __ cbnzw(rscratch2, rounds24_loop);
 5031 
 5032     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 5033     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 5034     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 5035     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 5036     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 5037     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 5038     __ st1(v24, __ D, 0, state0);
 5039     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 5040     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 5041     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 5042     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 5043     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 5044     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 5045     __ st1(v24, __ D, 1, state1);
 5046 
 5047     // restore callee-saved vector registers
 5048     __ ldpd(v14, v15, Address(sp, 48));
 5049     __ ldpd(v12, v13, Address(sp, 32));
 5050     __ ldpd(v10, v11, Address(sp, 16));
 5051     __ ldpd(v8, v9, __ post(sp, 64));
 5052 
 5053     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5054 
 5055     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 5056     __ ret(lr);
 5057 
 5058     // record the stub entry and end
 5059     store_archive_data(stub_id, start, __ pc());
 5060 
 5061     return start;
 5062   }
 5063 
 5064   // ChaCha20 block function.  This version parallelizes the 32-bit
 5065   // state elements on each of 16 vectors, producing 4 blocks of
 5066   // keystream at a time.
 5067   //
 5068   // state (int[16]) = c_rarg0
 5069   // keystream (byte[256]) = c_rarg1
 5070   // return - number of bytes of produced keystream (always 256)
 5071   //
 5072   // This implementation takes each 32-bit integer from the state
 5073   // array and broadcasts it across all 4 32-bit lanes of a vector register
 5074   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 5075   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 5076   // the quarter round schedule is implemented as outlined in RFC 7539 section
 5077   // 2.3.  However, instead of sequentially processing the 3 quarter round
 5078   // operations represented by one QUARTERROUND function, we instead stack all
 5079   // the adds, xors and left-rotations from the first 4 quarter rounds together
 5080   // and then do the same for the second set of 4 quarter rounds.  This removes
 5081   // some latency that would otherwise be incurred by waiting for an add to
 5082   // complete before performing an xor (which depends on the result of the
 5083   // add), etc. An adjustment happens between the first and second groups of 4
 5084   // quarter rounds, but this is done only in the inputs to the macro functions
 5085   // that generate the assembly instructions - these adjustments themselves are
 5086   // not part of the resulting assembly.
 5087   // The 4 registers v0-v3 are used during the quarter round operations as
 5088   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 5089   // registers become the vectors involved in adding the start state back onto
 5090   // the post-QR working state.  After the adds are complete, each of the 16
 5091   // vectors writes its first lane back to the keystream buffer, followed
 5092   // by the second lane from all vectors and so on.
 5093   address generate_chacha20Block_blockpar() {
 5094     StubId stub_id = StubId::stubgen_chacha20Block_id;
 5095     int entry_count = StubInfo::entry_count(stub_id);
 5096     assert(entry_count == 1, "sanity check");
 5097     address start = load_archive_data(stub_id);
 5098     if (start != nullptr) {
 5099       return start;
 5100     }
 5101     Label L_twoRounds, L_cc20_const;
 5102     __ align(CodeEntryAlignment);
 5103     StubCodeMark mark(this, stub_id);
 5104     start = __ pc();
 5105     __ enter();
 5106 
 5107     int i, j;
 5108     const Register state = c_rarg0;
 5109     const Register keystream = c_rarg1;
 5110     const Register loopCtr = r10;
 5111     const Register tmpAddr = r11;
 5112     const FloatRegister ctrAddOverlay = v28;
 5113     const FloatRegister lrot8Tbl = v29;
 5114 
 5115     // Organize SIMD registers in an array that facilitates
 5116     // putting repetitive opcodes into loop structures.  It is
 5117     // important that each grouping of 4 registers is monotonically
 5118     // increasing to support the requirements of multi-register
 5119     // instructions (e.g. ld4r, st4, etc.)
 5120     const FloatRegister workSt[16] = {
 5121          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 5122         v20, v21, v22, v23, v24, v25, v26, v27
 5123     };
 5124 
 5125     // Pull in constant data.  The first 16 bytes are the add overlay
 5126     // which is applied to the vector holding the counter (state[12]).
 5127     // The second 16 bytes is the index register for the 8-bit left
 5128     // rotation tbl instruction.
 5129     __ adr(tmpAddr, L_cc20_const);
 5130     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 5131 
 5132     // Load from memory and interlace across 16 SIMD registers,
 5133     // With each word from memory being broadcast to all lanes of
 5134     // each successive SIMD register.
 5135     //      Addr(0) -> All lanes in workSt[i]
 5136     //      Addr(4) -> All lanes workSt[i + 1], etc.
 5137     __ mov(tmpAddr, state);
 5138     for (i = 0; i < 16; i += 4) {
 5139       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 5140           __ post(tmpAddr, 16));
 5141     }
 5142     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5143 
 5144     // Before entering the loop, create 5 4-register arrays.  These
 5145     // will hold the 4 registers that represent the a/b/c/d fields
 5146     // in the quarter round operation.  For instance the "b" field
 5147     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 5148     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 5149     // since it is part of a diagonal organization.  The aSet and scratch
 5150     // register sets are defined at declaration time because they do not change
 5151     // organization at any point during the 20-round processing.
 5152     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 5153     FloatRegister bSet[4];
 5154     FloatRegister cSet[4];
 5155     FloatRegister dSet[4];
 5156     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 5157 
 5158     // Set up the 10 iteration loop and perform all 8 quarter round ops
 5159     __ mov(loopCtr, 10);
 5160     __ BIND(L_twoRounds);
 5161 
 5162     // Set to columnar organization and do the following 4 quarter-rounds:
 5163     // QUARTERROUND(0, 4, 8, 12)
 5164     // QUARTERROUND(1, 5, 9, 13)
 5165     // QUARTERROUND(2, 6, 10, 14)
 5166     // QUARTERROUND(3, 7, 11, 15)
 5167     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 5168     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 5169     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 5170 
 5171     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5172     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5173     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5174 
 5175     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5176     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5177     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5178 
 5179     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5180     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5181     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5182 
 5183     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5184     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5185     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 12
 5186 
 5187     // Set to diagonal organization and do the next 4 quarter-rounds:
 5188     // QUARTERROUND(0, 5, 10, 15)
 5189     // QUARTERROUND(1, 6, 11, 12)
 5190     // QUARTERROUND(2, 7, 8, 13)
 5191     // QUARTERROUND(3, 4, 9, 14)
 5192     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 5193     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 5194     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 5195 
 5196     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5197     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5198     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5199 
 5200     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5201     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5202     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5203 
 5204     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5205     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5206     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5207 
 5208     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5209     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5210     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 12
 5211 
 5212     // Decrement and iterate
 5213     __ sub(loopCtr, loopCtr, 1);
 5214     __ cbnz(loopCtr, L_twoRounds);
 5215 
 5216     __ mov(tmpAddr, state);
 5217 
 5218     // Add the starting state back to the post-loop keystream
 5219     // state.  We read/interlace the state array from memory into
 5220     // 4 registers similar to what we did in the beginning.  Then
 5221     // add the counter overlay onto workSt[12] at the end.
 5222     for (i = 0; i < 16; i += 4) {
 5223       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 5224       __ addv(workSt[i], __ T4S, workSt[i], v0);
 5225       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 5226       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 5227       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 5228     }
 5229     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5230 
 5231     // Write working state into the keystream buffer.  This is accomplished
 5232     // by taking the lane "i" from each of the four vectors and writing
 5233     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 5234     // repeating with the next 4 vectors until all 16 vectors have been used.
 5235     // Then move to the next lane and repeat the process until all lanes have
 5236     // been written.
 5237     for (i = 0; i < 4; i++) {
 5238       for (j = 0; j < 16; j += 4) {
 5239         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 5240             __ post(keystream, 16));
 5241       }
 5242     }
 5243 
 5244     __ mov(r0, 256);             // Return length of output keystream
 5245     __ leave();
 5246     __ ret(lr);
 5247 
 5248     // bind label and generate local constant data used by this stub
 5249     // The constant data is broken into two 128-bit segments to be loaded
 5250     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 5251     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 5252     // The second 128-bits is a table constant used for 8-bit left rotations.
 5253     __ BIND(L_cc20_const);
 5254     __ emit_int64(0x0000000100000000UL);
 5255     __ emit_int64(0x0000000300000002UL);
 5256     __ emit_int64(0x0605040702010003UL);
 5257     __ emit_int64(0x0E0D0C0F0A09080BUL);
 5258 
 5259     // record the stub entry and end
 5260     store_archive_data(stub_id, start, __ pc());
 5261 
 5262     return start;
 5263   }
 5264 
 5265   // Helpers to schedule parallel operation bundles across vector
 5266   // register sequences of size 2, 4 or 8.
 5267 
 5268   // Implement various primitive computations across vector sequences
 5269 
 5270   template<int N>
 5271   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5272                const VSeq<N>& v1, const VSeq<N>& v2) {
 5273     // output must not be constant
 5274     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5275     // output cannot overwrite pending inputs
 5276     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5277     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5278     for (int i = 0; i < N; i++) {
 5279       __ addv(v[i], T, v1[i], v2[i]);
 5280     }
 5281   }
 5282 
 5283   template<int N>
 5284   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5285                const VSeq<N>& v1, const VSeq<N>& v2) {
 5286     // output must not be constant
 5287     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5288     // output cannot overwrite pending inputs
 5289     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5290     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5291     for (int i = 0; i < N; i++) {
 5292       __ subv(v[i], T, v1[i], v2[i]);
 5293     }
 5294   }
 5295 
 5296   template<int N>
 5297   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5298                const VSeq<N>& v1, const VSeq<N>& v2) {
 5299     // output must not be constant
 5300     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5301     // output cannot overwrite pending inputs
 5302     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5303     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5304     for (int i = 0; i < N; i++) {
 5305       __ mulv(v[i], T, v1[i], v2[i]);
 5306     }
 5307   }
 5308 
 5309   template<int N>
 5310   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 5311     // output must not be constant
 5312     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5313     // output cannot overwrite pending inputs
 5314     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5315     for (int i = 0; i < N; i++) {
 5316       __ negr(v[i], T, v1[i]);
 5317     }
 5318   }
 5319 
 5320   template<int N>
 5321   void vs_shl(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5322               const VSeq<N>& v1, int shift) {
 5323     // output must not be constant
 5324     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5325     // output cannot overwrite pending inputs
 5326     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5327 
 5328     for (int i = 0; i < N; i++) {
 5329       __ shl(v[i], T, v1[i], shift);
 5330     }
 5331   }
 5332 
 5333   template<int N>
 5334   void vs_ushr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5335                const VSeq<N>& v1, int shift) {
 5336     // output must not be constant
 5337     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5338     // output cannot overwrite pending inputs
 5339     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5340 
 5341     for (int i = 0; i < N; i++) {
 5342       __ ushr(v[i], T, v1[i], shift);
 5343     }
 5344   }
 5345 
 5346   template<int N>
 5347   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5348                const VSeq<N>& v1, int shift) {
 5349     // output must not be constant
 5350     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5351     // output cannot overwrite pending inputs
 5352     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5353     for (int i = 0; i < N; i++) {
 5354       __ sshr(v[i], T, v1[i], shift);
 5355     }
 5356   }
 5357 
 5358   template<int N>
 5359   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5360     // output must not be constant
 5361     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5362     // output cannot overwrite pending inputs
 5363     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5364     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5365     for (int i = 0; i < N; i++) {
 5366       __ andr(v[i], __ T16B, v1[i], v2[i]);
 5367     }
 5368   }
 5369 
 5370   template<int N>
 5371   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const FloatRegister v2) {
 5372     // output must not be constant
 5373     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5374     // output cannot overwrite pending inputs
 5375     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5376     for (int i = 0; i < N; i++) {
 5377       __ andr(v[i], __ T16B, v1[i], v2);
 5378     }
 5379   }
 5380 
 5381   template<int N>
 5382   void vs_eor(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5383     // output must not be constant
 5384     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5385     // output cannot overwrite pending inputs
 5386     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5387     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5388     for (int i = 0; i < N; i++) {
 5389       __ eor(v[i], __ T16B, v1[i], v2[i]);
 5390     }
 5391   }
 5392 
 5393   template<int N>
 5394   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5395     // output must not be constant
 5396     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5397     // output cannot overwrite pending inputs
 5398     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5399     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5400     for (int i = 0; i < N; i++) {
 5401       __ orr(v[i], __ T16B, v1[i], v2[i]);
 5402     }
 5403   }
 5404 
 5405   template<int N>
 5406   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 5407     // output must not be constant
 5408     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5409     // output cannot overwrite pending inputs
 5410     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5411     for (int i = 0; i < N; i++) {
 5412       __ notr(v[i], __ T16B, v1[i]);
 5413     }
 5414   }
 5415 
 5416   template<int N>
 5417   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 5418     // output must not be constant
 5419     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5420     // output cannot overwrite pending inputs
 5421     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5422     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5423     for (int i = 0; i < N; i++) {
 5424       __ sqdmulh(v[i], T, v1[i], v2[i]);
 5425     }
 5426   }
 5427 
 5428   template<int N>
 5429   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 5430     // output must not be constant
 5431     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5432     // output cannot overwrite pending inputs
 5433     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5434     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5435     for (int i = 0; i < N; i++) {
 5436       __ mlsv(v[i], T, v1[i], v2[i]);
 5437     }
 5438   }
 5439 
 5440   // load N/2 successive pairs of quadword values from memory in order
 5441   // into N successive vector registers of the sequence via the
 5442   // address supplied in base.
 5443   template<int N>
 5444   void vs_ldpq(const VSeq<N>& v, Register base) {
 5445     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5446     for (int i = 0; i < N; i += 2) {
 5447       __ ldpq(v[i], v[i+1], Address(base, 16 * i));
 5448     }
 5449   }
 5450 
 5451   // load N/2 successive pairs of quadword values from memory in order
 5452   // into N vector registers of the sequence via the address supplied
 5453   // in base using post-increment addressing
 5454   template<int N>
 5455   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 5456     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5457     for (int i = 0; i < N; i += 2) {
 5458       __ ldpq(v[i], v[i+1], __ post(base, 32));
 5459     }
 5460   }
 5461 
 5462   // store N successive vector registers of the sequence into N/2
 5463   // successive pairs of quadword memory locations via the address
 5464   // supplied in base using post-increment addressing
 5465   template<int N>
 5466   void vs_stpq_post(const VSeq<N>& v, Register base) {
 5467     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5468     for (int i = 0; i < N; i += 2) {
 5469       __ stpq(v[i], v[i+1], __ post(base, 32));
 5470     }
 5471   }
 5472 
 5473   // load N/2 pairs of quadword values from memory de-interleaved into
 5474   // N vector registers 2 at a time via the address supplied in base
 5475   // using post-increment addressing.
 5476   template<int N>
 5477   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5478     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5479     for (int i = 0; i < N; i += 2) {
 5480       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 5481     }
 5482   }
 5483 
 5484   // store N vector registers interleaved into N/2 pairs of quadword
 5485   // memory locations via the address supplied in base using
 5486   // post-increment addressing.
 5487   template<int N>
 5488   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5489     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5490     for (int i = 0; i < N; i += 2) {
 5491       __ st2(v[i], v[i+1], T, __ post(base, 32));
 5492     }
 5493   }
 5494 
 5495   // store two vector register sequences of length N
 5496   // interleaved into N pairs of quadword memory locations
 5497   // starting at the address supplied in dest using
 5498   // post-increment addressing.
 5499   template<int N>
 5500   void vs_st1_interleaved(VSeq<N> A, VSeq<N> B, Register dest) {
 5501     for (int i = 0; i < N; i++) {
 5502       __ st1(A[i], __ T2D, __ post(dest, 16));
 5503       __ st1(B[i], __ T2D, __ post(dest, 16));
 5504     }
 5505   }
 5506 
 5507   // load N quadword values from memory de-interleaved into N vector
 5508   // registers 3 elements at a time via the address supplied in base.
 5509   template<int N>
 5510   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5511     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 5512     for (int i = 0; i < N; i += 3) {
 5513       __ ld3(v[i], v[i+1], v[i+2], T, base);
 5514     }
 5515   }
 5516 
 5517   // load N quadword values from memory de-interleaved into N vector
 5518   // registers 3 elements at a time via the address supplied in base
 5519   // using post-increment addressing.
 5520   template<int N>
 5521   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5522     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 5523     for (int i = 0; i < N; i += 3) {
 5524       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 5525     }
 5526   }
 5527 
 5528   // load N/2 pairs of quadword values from memory into N vector
 5529   // registers via the address supplied in base with each pair indexed
 5530   // using the start offset plus the corresponding entry in the
 5531   // offsets array
 5532   template<int N>
 5533   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5534     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5535     for (int i = 0; i < N/2; i++) {
 5536       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5537     }
 5538   }
 5539 
 5540   // store N vector registers into N/2 pairs of quadword memory
 5541   // locations via the address supplied in base with each pair indexed
 5542   // using the start offset plus the corresponding entry in the
 5543   // offsets array
 5544   template<int N>
 5545   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 5546     for (int i = 0; i < N/2; i++) {
 5547       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5548     }
 5549   }
 5550 
 5551   // load N single quadword values from memory into N vector registers
 5552   // via the address supplied in base with each value indexed using
 5553   // the start offset plus the corresponding entry in the offsets
 5554   // array
 5555   template<int N>
 5556   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5557                       int start, int (&offsets)[N]) {
 5558     for (int i = 0; i < N; i++) {
 5559       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5560     }
 5561   }
 5562 
 5563   // store N vector registers into N single quadword memory locations
 5564   // via the address supplied in base with each value indexed using
 5565   // the start offset plus the corresponding entry in the offsets
 5566   // array
 5567   template<int N>
 5568   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5569                       int start, int (&offsets)[N]) {
 5570     for (int i = 0; i < N; i++) {
 5571       __ str(v[i], T, Address(base, start + offsets[i]));
 5572     }
 5573   }
 5574 
 5575   // load N/2 pairs of quadword values from memory de-interleaved into
 5576   // N vector registers 2 at a time via the address supplied in base
 5577   // with each pair indexed using the start offset plus the
 5578   // corresponding entry in the offsets array
 5579   template<int N>
 5580   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5581                       Register tmp, int start, int (&offsets)[N/2]) {
 5582     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5583     for (int i = 0; i < N/2; i++) {
 5584       __ add(tmp, base, start + offsets[i]);
 5585       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5586     }
 5587   }
 5588 
 5589   // store N vector registers 2 at a time interleaved into N/2 pairs
 5590   // of quadword memory locations via the address supplied in base
 5591   // with each pair indexed using the start offset plus the
 5592   // corresponding entry in the offsets array
 5593   template<int N>
 5594   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5595                       Register tmp, int start, int (&offsets)[N/2]) {
 5596     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5597     for (int i = 0; i < N/2; i++) {
 5598       __ add(tmp, base, start + offsets[i]);
 5599       __ st2(v[2*i], v[2*i+1], T, tmp);
 5600     }
 5601   }
 5602 
 5603   // Helper routines for various flavours of Montgomery multiply
 5604 
 5605   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5606   // multiplications in parallel
 5607   //
 5608 
 5609   // See the montMul() method of the sun.security.provider.ML_DSA
 5610   // class.
 5611   //
 5612   // Computes 4x4S results or 8x8H results
 5613   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5614   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5615   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5616   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5617   // Outputs: va - 4x4S or 4x8H vector register sequences
 5618   // vb, vc, vtmp and vq must all be disjoint
 5619   // va must be disjoint from all other inputs/temps or must equal vc
 5620   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5621   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5622   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5623                    Assembler::SIMD_Arrangement T,
 5624                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5625     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5626     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5627     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5628     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5629 
 5630     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5631     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5632 
 5633     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5634 
 5635     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5636     assert(vs_disjoint(va, vb), "va and vb overlap");
 5637     assert(vs_disjoint(va, vq), "va and vq overlap");
 5638     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5639     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5640 
 5641     // schedule 4 streams of instructions across the vector sequences
 5642     for (int i = 0; i < 4; i++) {
 5643       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5644       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5645     }
 5646 
 5647     for (int i = 0; i < 4; i++) {
 5648       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5649     }
 5650 
 5651     for (int i = 0; i < 4; i++) {
 5652       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5653     }
 5654 
 5655     for (int i = 0; i < 4; i++) {
 5656       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5657     }
 5658   }
 5659 
 5660   // Perform 8 32-bit (4x4S) or 16 16-bit (2 x 8H) Montgomery
 5661   // multiplications in parallel
 5662   //
 5663 
 5664   // See the montMul() method of the sun.security.provider.ML_DSA
 5665   // class.
 5666   //
 5667   // Computes 4x4S results or 8x8H results
 5668   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5669   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5670   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5671   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5672   // Outputs: va - 4x4S or 4x8H vector register sequences
 5673   // vb, vc, vtmp and vq must all be disjoint
 5674   // va must be disjoint from all other inputs/temps or must equal vc
 5675   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5676   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5677   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5678                    Assembler::SIMD_Arrangement T,
 5679                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5680     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5681     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5682     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5683     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5684 
 5685     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5686     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5687 
 5688     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5689 
 5690     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5691     assert(vs_disjoint(va, vb), "va and vb overlap");
 5692     assert(vs_disjoint(va, vq), "va and vq overlap");
 5693     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5694     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5695 
 5696     // schedule 2 streams of instructions across the vector sequences
 5697     for (int i = 0; i < 2; i++) {
 5698       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5699       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5700     }
 5701 
 5702     for (int i = 0; i < 2; i++) {
 5703       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5704     }
 5705 
 5706     for (int i = 0; i < 2; i++) {
 5707       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5708     }
 5709 
 5710     for (int i = 0; i < 2; i++) {
 5711       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5712     }
 5713   }
 5714 
 5715   // Perform 16 16-bit Montgomery multiplications in parallel.
 5716   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5717                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5718     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5719     // It will assert that the register use is valid
 5720     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5721   }
 5722 
 5723   // Perform 32 16-bit Montgomery multiplications in parallel.
 5724   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5725                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5726     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5727     // It will assert that the register use is valid
 5728     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5729   }
 5730 
 5731   // Perform 64 16-bit Montgomery multiplications in parallel.
 5732   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5733                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5734     // Schedule two successive 4x8H multiplies via the montmul helper
 5735     // on the front and back halves of va, vb and vc. The helper will
 5736     // assert that the register use has no overlap conflicts on each
 5737     // individual call but we also need to ensure that the necessary
 5738     // disjoint/equality constraints are met across both calls.
 5739 
 5740     // vb, vc, vtmp and vq must be disjoint. va must either be
 5741     // disjoint from all other registers or equal vc
 5742 
 5743     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5744     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5745     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5746 
 5747     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5748     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5749 
 5750     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5751 
 5752     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5753     assert(vs_disjoint(va, vb), "va and vb overlap");
 5754     assert(vs_disjoint(va, vq), "va and vq overlap");
 5755     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5756 
 5757     // we multiply the front and back halves of each sequence 4 at a
 5758     // time because
 5759     //
 5760     // 1) we are currently only able to get 4-way instruction
 5761     // parallelism at best
 5762     //
 5763     // 2) we need registers for the constants in vq and temporary
 5764     // scratch registers to hold intermediate results so vtmp can only
 5765     // be a VSeq<4> which means we only have 4 scratch slots
 5766 
 5767     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5768     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5769   }
 5770 
 5771   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5772                                const VSeq<4>& vc,
 5773                                const VSeq<4>& vtmp,
 5774                                const VSeq<2>& vq) {
 5775     // compute a = montmul(a1, c)
 5776     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5777     // ouptut a1 = a0 - a
 5778     vs_subv(va1, __ T8H, va0, vc);
 5779     //    and a0 = a0 + a
 5780     vs_addv(va0, __ T8H, va0, vc);
 5781   }
 5782 
 5783   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5784                                const VSeq<4>& vb,
 5785                                const VSeq<4>& vtmp1,
 5786                                const VSeq<4>& vtmp2,
 5787                                const VSeq<2>& vq) {
 5788     // compute c = a0 - a1
 5789     vs_subv(vtmp1, __ T8H, va0, va1);
 5790     // output a0 = a0 + a1
 5791     vs_addv(va0, __ T8H, va0, va1);
 5792     // output a1 = b montmul c
 5793     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5794   }
 5795 
 5796   void load64shorts(const VSeq<8>& v, Register shorts) {
 5797     vs_ldpq_post(v, shorts);
 5798   }
 5799 
 5800   void load32shorts(const VSeq<4>& v, Register shorts) {
 5801     vs_ldpq_post(v, shorts);
 5802   }
 5803 
 5804   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5805     vs_stpq_post(v, tmpAddr);
 5806   }
 5807 
 5808   // Kyber NTT function.
 5809   // Implements
 5810   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5811   //
 5812   // coeffs (short[256]) = c_rarg0
 5813   // ntt_zetas (short[256]) = c_rarg1
 5814   address generate_kyberNtt() {
 5815     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5816     int entry_count = StubInfo::entry_count(stub_id);
 5817     assert(entry_count == 1, "sanity check");
 5818     address start = load_archive_data(stub_id);
 5819     if (start != nullptr) {
 5820       return start;
 5821     }
 5822     __ align(CodeEntryAlignment);
 5823     StubCodeMark mark(this, stub_id);
 5824     start = __ pc();
 5825     __ enter();
 5826 
 5827     const Register coeffs = c_rarg0;
 5828     const Register zetas = c_rarg1;
 5829 
 5830     const Register kyberConsts = r10;
 5831     const Register tmpAddr = r11;
 5832 
 5833     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5834     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5835     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5836 
 5837     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5838     // load the montmul constants
 5839     vs_ldpq(vq, kyberConsts);
 5840 
 5841     // Each level corresponds to an iteration of the outermost loop of the
 5842     // Java method seilerNTT(int[] coeffs). There are some differences
 5843     // from what is done in the seilerNTT() method, though:
 5844     // 1. The computation is using 16-bit signed values, we do not convert them
 5845     // to ints here.
 5846     // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
 5847     // this array for each level, it is easier that way to fill up the vector
 5848     // registers.
 5849     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5850     // multiplications (this is because that way there should not be any
 5851     // overflow during the inverse NTT computation), here we use R = 2^16 so
 5852     // that we can use the 16-bit arithmetic in the vector unit.
 5853     //
 5854     // On each level, we fill up the vector registers in such a way that the
 5855     // array elements that need to be multiplied by the zetas go into one
 5856     // set of vector registers while the corresponding ones that don't need to
 5857     // be multiplied, go into another set.
 5858     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5859     // registers interleaving the steps of 4 identical computations,
 5860     // each done on 8 16-bit values per register.
 5861 
 5862     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5863     // to the zetas occur in discrete blocks whose size is some multiple
 5864     // of 32.
 5865 
 5866     // level 0
 5867     __ add(tmpAddr, coeffs, 256);
 5868     load64shorts(vs1, tmpAddr);
 5869     load64shorts(vs2, zetas);
 5870     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5871     __ add(tmpAddr, coeffs, 0);
 5872     load64shorts(vs1, tmpAddr);
 5873     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5874     vs_addv(vs1, __ T8H, vs1, vs2);
 5875     __ add(tmpAddr, coeffs, 0);
 5876     vs_stpq_post(vs1, tmpAddr);
 5877     __ add(tmpAddr, coeffs, 256);
 5878     vs_stpq_post(vs3, tmpAddr);
 5879     // restore montmul constants
 5880     vs_ldpq(vq, kyberConsts);
 5881     load64shorts(vs1, tmpAddr);
 5882     load64shorts(vs2, zetas);
 5883     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5884     __ add(tmpAddr, coeffs, 128);
 5885     load64shorts(vs1, tmpAddr);
 5886     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5887     vs_addv(vs1, __ T8H, vs1, vs2);
 5888     __ add(tmpAddr, coeffs, 128);
 5889     store64shorts(vs1, tmpAddr);
 5890     __ add(tmpAddr, coeffs, 384);
 5891     store64shorts(vs3, tmpAddr);
 5892 
 5893     // level 1
 5894     // restore montmul constants
 5895     vs_ldpq(vq, kyberConsts);
 5896     __ add(tmpAddr, coeffs, 128);
 5897     load64shorts(vs1, tmpAddr);
 5898     load64shorts(vs2, zetas);
 5899     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5900     __ add(tmpAddr, coeffs, 0);
 5901     load64shorts(vs1, tmpAddr);
 5902     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5903     vs_addv(vs1, __ T8H, vs1, vs2);
 5904     __ add(tmpAddr, coeffs, 0);
 5905     store64shorts(vs1, tmpAddr);
 5906     store64shorts(vs3, tmpAddr);
 5907     vs_ldpq(vq, kyberConsts);
 5908     __ add(tmpAddr, coeffs, 384);
 5909     load64shorts(vs1, tmpAddr);
 5910     load64shorts(vs2, zetas);
 5911     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5912     __ add(tmpAddr, coeffs, 256);
 5913     load64shorts(vs1, tmpAddr);
 5914     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5915     vs_addv(vs1, __ T8H, vs1, vs2);
 5916     __ add(tmpAddr, coeffs, 256);
 5917     store64shorts(vs1, tmpAddr);
 5918     store64shorts(vs3, tmpAddr);
 5919 
 5920     // level 2
 5921     vs_ldpq(vq, kyberConsts);
 5922     int offsets1[4] = { 0, 32, 128, 160 };
 5923     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5924     load64shorts(vs2, zetas);
 5925     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5926     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5927     // kyber_subv_addv64();
 5928     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5929     vs_addv(vs1, __ T8H, vs1, vs2);
 5930     __ add(tmpAddr, coeffs, 0);
 5931     vs_stpq_post(vs_front(vs1), tmpAddr);
 5932     vs_stpq_post(vs_front(vs3), tmpAddr);
 5933     vs_stpq_post(vs_back(vs1), tmpAddr);
 5934     vs_stpq_post(vs_back(vs3), tmpAddr);
 5935     vs_ldpq(vq, kyberConsts);
 5936     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5937     load64shorts(vs2, zetas);
 5938     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5939     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5940     // kyber_subv_addv64();
 5941     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5942     vs_addv(vs1, __ T8H, vs1, vs2);
 5943     __ add(tmpAddr, coeffs, 256);
 5944     vs_stpq_post(vs_front(vs1), tmpAddr);
 5945     vs_stpq_post(vs_front(vs3), tmpAddr);
 5946     vs_stpq_post(vs_back(vs1), tmpAddr);
 5947     vs_stpq_post(vs_back(vs3), tmpAddr);
 5948 
 5949     // level 3
 5950     vs_ldpq(vq, kyberConsts);
 5951     int offsets2[4] = { 0, 64, 128, 192 };
 5952     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5953     load64shorts(vs2, zetas);
 5954     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5955     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5956     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5957     vs_addv(vs1, __ T8H, vs1, vs2);
 5958     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5959     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5960 
 5961     vs_ldpq(vq, kyberConsts);
 5962     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5963     load64shorts(vs2, zetas);
 5964     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5965     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5966     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5967     vs_addv(vs1, __ T8H, vs1, vs2);
 5968     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5969     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5970 
 5971     // level 4
 5972     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5973     // so they are loaded by employing an ldr at 8 distinct offsets.
 5974 
 5975     vs_ldpq(vq, kyberConsts);
 5976     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5977     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5978     load64shorts(vs2, zetas);
 5979     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5980     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5981     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5982     vs_addv(vs1, __ T8H, vs1, vs2);
 5983     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5984     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5985 
 5986     vs_ldpq(vq, kyberConsts);
 5987     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5988     load64shorts(vs2, zetas);
 5989     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5990     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5991     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5992     vs_addv(vs1, __ T8H, vs1, vs2);
 5993     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5994     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5995 
 5996     // level 5
 5997     // At level 5 related coefficients occur in discrete blocks of size 8 so
 5998     // need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5999 
 6000     vs_ldpq(vq, kyberConsts);
 6001     int offsets4[4] = { 0, 32, 64, 96 };
 6002     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6003     load32shorts(vs_front(vs2), zetas);
 6004     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6005     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6006     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6007     load32shorts(vs_front(vs2), zetas);
 6008     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6009     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6010     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6011     load32shorts(vs_front(vs2), zetas);
 6012     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6013     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6014 
 6015     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6016     load32shorts(vs_front(vs2), zetas);
 6017     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6018     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6019 
 6020     // level 6
 6021     // At level 6 related coefficients occur in discrete blocks of size 4 so
 6022     // need to be loaded interleaved using an ld2 operation with arrangement 4S.
 6023 
 6024     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6025     load32shorts(vs_front(vs2), zetas);
 6026     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6027     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6028     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6029     load32shorts(vs_front(vs2), zetas);
 6030     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6031     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6032 
 6033     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6034     load32shorts(vs_front(vs2), zetas);
 6035     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6036     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6037 
 6038     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6039     load32shorts(vs_front(vs2), zetas);
 6040     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6041     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6042 
 6043     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6044     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6045     __ ret(lr);
 6046 
 6047     // record the stub entry and end
 6048     store_archive_data(stub_id, start, __ pc());
 6049 
 6050     return start;
 6051   }
 6052 
 6053   // Kyber Inverse NTT function
 6054   // Implements
 6055   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 6056   //
 6057   // coeffs (short[256]) = c_rarg0
 6058   // ntt_zetas (short[256]) = c_rarg1
        //
        // The emitted code transforms coeffs in place. It runs seven butterfly
        // levels (0-6); levels 2-6 add and subtract pairs of coefficient groups
        // and Montgomery-multiply the differences by zetas, while levels 0 and 1
        // delegate that work to kyber_sub_add_montmul32. Two Barrett reduction
        // passes follow levels 2 and 5, and a final pass Montgomery-multiplies
        // all 256 coefficients by the constant at kyberConsts + 48.
        // The stub returns 0 in r0.
        //
        // Returns the entry address of the stub: either the address handed back
        // by load_archive_data() or the start of the newly generated code.
        //
        // NOTE(review): zetas is passed to load32shorts/load64shorts for
        // 7 * 128 shorts in total. If those helpers advance the register (as
        // store64shorts is said to do further down), the short[256] size quoted
        // above looks stale -- confirm against the Java caller.
 6059   address generate_kyberInverseNtt() {
 6060     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 6061     int entry_count = StubInfo::entry_count(stub_id);
 6062     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, use it
          // and generate nothing.
 6063     address start = load_archive_data(stub_id);
 6064     if (start != nullptr) {
 6065       return start;
 6066     }
 6067     __ align(CodeEntryAlignment);
 6068     StubCodeMark mark(this, stub_id);
 6069     start = __ pc();
 6070     __ enter();
 6071 
 6072     const Register coeffs = c_rarg0;
 6073     const Register zetas = c_rarg1;
 6074 
 6075     const Register kyberConsts = r10;
 6076     const Register tmpAddr = r11;
 6077     const Register tmpAddr2 = c_rarg2; // NOTE(review): never referenced in this function
 6078 
 6079     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 6080     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6081     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6082 
 6083     __ lea(kyberConsts,
 6084              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6085 
 6086     // level 0
 6087     // At level 0 related coefficients occur in discrete blocks of size 4 so
 6088     // need to be loaded interleaved using an ld2 operation with arrangement 4S.
 6089 
          // load the constant pair that the montmul helpers take as vq
 6090     vs_ldpq(vq, kyberConsts);
 6091     int offsets4[4] = { 0, 32, 64, 96 };
          // Four passes, one per 128-byte quarter of coeffs (base offsets 0, 128,
          // 256 and 384); each pass reads the next 32 zetas.
 6092     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6093     load32shorts(vs_front(vs2), zetas);
 6094     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6095                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6096     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6097     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6098     load32shorts(vs_front(vs2), zetas);
 6099     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6100                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6101     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6102     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6103     load32shorts(vs_front(vs2), zetas);
 6104     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6105                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6106     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6107     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6108     load32shorts(vs_front(vs2), zetas);
 6109     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6110                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6111     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6112 
 6113     // level 1
 6114     // At level 1 related coefficients occur in discrete blocks of size 8 so
 6115     // need to be loaded interleaved using an ld2 operation with arrangement 2D.
 6116 
          // Same four-quarter pattern as level 0; vq is not reloaded between
          // the two levels.
 6117     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6118     load32shorts(vs_front(vs2), zetas);
 6119     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6120                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6121     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6122     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6123     load32shorts(vs_front(vs2), zetas);
 6124     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6125                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6126     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6127 
 6128     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6129     load32shorts(vs_front(vs2), zetas);
 6130     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6131                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6132     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6133     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6134     load32shorts(vs_front(vs2), zetas);
 6135     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6136                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6137     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6138 
 6139     // level 2
 6140     // At level 2 coefficients occur in 8 discrete blocks of size 16
 6141     // so they are loaded by employing an ldr at 8 distinct offsets.
 6142 
          // Pattern used from here through level 6, once per 256-byte half:
          //   vs3 <- vs1 + vs2, written back to where vs1 was loaded from
          //   vs1 <- vs1 - vs2
          //   vs2 <- montmul(vs1, zetas), written back to where vs2 was loaded from
          // The sums in vs3 are stored before vq is reloaded because vq occupies
          // registers that belong to vs3.
 6143     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6144     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6145     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6146     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6147     vs_subv(vs1, __ T8H, vs1, vs2);
 6148     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 6149     load64shorts(vs2, zetas);
 6150     vs_ldpq(vq, kyberConsts);
 6151     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6152     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6153 
 6154     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6155     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6156     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6157     vs_subv(vs1, __ T8H, vs1, vs2);
 6158     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 6159     load64shorts(vs2, zetas);
 6160     vs_ldpq(vq, kyberConsts);
 6161     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6162     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6163 
 6164     // Barrett reduction at indexes where overflow may happen
 6165 
 6166     // load q and the multiplier for the Barrett reduction
 6167     __ add(tmpAddr, kyberConsts, 16);
 6168     vs_ldpq(vq, tmpAddr);
 6169 
          // vq1 and vq2 repeat the two registers of vq, so they only name q and
          // the Barrett multiplier while vq holds the pair loaded from
          // kyberConsts + 16. vq3 repeats v29, which is not loaded until the
          // final scaling step at the end of this function.
 6170     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 6171     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 6172     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
          // Per 16-bit lane: vs2 = sqdmulh(vs1, multiplier) >> 11, then
          // vs1 -= vs2 * q. Applied to the blocks that received the level 2 sums.
 6173     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6174     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6175     vs_sshr(vs2, __ T8H, vs2, 11);
 6176     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6177     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6178     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6179     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6180     vs_sshr(vs2, __ T8H, vs2, 11);
 6181     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6182     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6183 
 6184     // level 3
 6185     // From level 3 upwards coefficients occur in discrete blocks whose size is
 6186     // some multiple of 32 so can be loaded using ldpq and suitable indexes.
 6187 
 6188     int offsets2[4] = { 0, 64, 128, 192 };
 6189     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 6190     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 6191     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6192     vs_subv(vs1, __ T8H, vs1, vs2);
 6193     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 6194     load64shorts(vs2, zetas);
 6195     vs_ldpq(vq, kyberConsts);
 6196     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6197     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 6198 
 6199     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 6200     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6201     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6202     vs_subv(vs1, __ T8H, vs1, vs2);
 6203     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 6204     load64shorts(vs2, zetas);
 6205     vs_ldpq(vq, kyberConsts);
 6206     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6207     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6208 
 6209     // level 4
 6210 
          // The two operand groups are now 64 bytes apart (base offsets 0 and 64).
 6211     int offsets1[4] = { 0, 32, 128, 160 };
 6212     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 6213     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 6214     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6215     vs_subv(vs1, __ T8H, vs1, vs2);
 6216     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 6217     load64shorts(vs2, zetas);
 6218     vs_ldpq(vq, kyberConsts);
 6219     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6220     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 6221 
 6222     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 6223     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6224     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6225     vs_subv(vs1, __ T8H, vs1, vs2);
 6226     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 6227     load64shorts(vs2, zetas);
 6228     vs_ldpq(vq, kyberConsts);
 6229     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6230     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6231 
 6232     // level 5
 6233 
          // Operand groups are whole 128-byte runs: coeffs + 0 with coeffs + 128,
          // then coeffs + 256 with coeffs + 384.
 6234     __ add(tmpAddr, coeffs, 0);
 6235     load64shorts(vs1, tmpAddr);
 6236     __ add(tmpAddr, coeffs, 128);
 6237     load64shorts(vs2, tmpAddr);
 6238     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6239     vs_subv(vs1, __ T8H, vs1, vs2);
 6240     __ add(tmpAddr, coeffs, 0);
 6241     store64shorts(vs3, tmpAddr);
 6242     load64shorts(vs2, zetas);
 6243     vs_ldpq(vq, kyberConsts);
 6244     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6245     __ add(tmpAddr, coeffs, 128);
 6246     store64shorts(vs2, tmpAddr);
 6247 
          // tmpAddr is not reset before this load: it is presumably left at
          // coeffs + 256 by the preceding store64shorts (see the comments in the
          // final scaling step below).
 6248     load64shorts(vs1, tmpAddr);
 6249     __ add(tmpAddr, coeffs, 384);
 6250     load64shorts(vs2, tmpAddr);
 6251     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6252     vs_subv(vs1, __ T8H, vs1, vs2);
 6253     __ add(tmpAddr, coeffs, 256);
 6254     store64shorts(vs3, tmpAddr);
 6255     load64shorts(vs2, zetas);
 6256     vs_ldpq(vq, kyberConsts);
 6257     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6258     __ add(tmpAddr, coeffs, 384);
 6259     store64shorts(vs2, tmpAddr);
 6260 
 6261     // Barrett reduction at indexes where overflow may happen
 6262 
 6263     // load q and the multiplier for the Barrett reduction
 6264     __ add(tmpAddr, kyberConsts, 16);
 6265     vs_ldpq(vq, tmpAddr);
 6266 
          // Only the front four registers of vs1 are loaded and stored here (one
          // register pair at coeffs + 0 and one at coeffs + 256); the reduction
          // instructions are still issued over all eight registers of vs1/vs2.
 6267     int offsets0[2] = { 0, 256 };
 6268     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6269     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6270     vs_sshr(vs2, __ T8H, vs2, 11);
 6271     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6272     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6273 
 6274     // level 6
 6275 
          // Operand groups are 256 bytes apart: coeffs + 0 with coeffs + 256,
          // then coeffs + 128 with coeffs + 384.
 6276     __ add(tmpAddr, coeffs, 0);
 6277     load64shorts(vs1, tmpAddr);
 6278     __ add(tmpAddr, coeffs, 256);
 6279     load64shorts(vs2, tmpAddr);
 6280     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6281     vs_subv(vs1, __ T8H, vs1, vs2);
 6282     __ add(tmpAddr, coeffs, 0);
 6283     store64shorts(vs3, tmpAddr);
 6284     load64shorts(vs2, zetas);
 6285     vs_ldpq(vq, kyberConsts);
 6286     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6287     __ add(tmpAddr, coeffs, 256);
 6288     store64shorts(vs2, tmpAddr);
 6289 
 6290     __ add(tmpAddr, coeffs, 128);
 6291     load64shorts(vs1, tmpAddr);
 6292     __ add(tmpAddr, coeffs, 384);
 6293     load64shorts(vs2, tmpAddr);
 6294     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6295     vs_subv(vs1, __ T8H, vs1, vs2);
 6296     __ add(tmpAddr, coeffs, 128);
 6297     store64shorts(vs3, tmpAddr);
 6298     load64shorts(vs2, zetas);
 6299     vs_ldpq(vq, kyberConsts);
 6300     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6301     __ add(tmpAddr, coeffs, 384);
 6302     store64shorts(vs2, tmpAddr);
 6303 
 6304     // multiply by 2^-n
 6305 
 6306     // load toMont(2^-n mod q)
 6307     __ add(tmpAddr, kyberConsts, 48);
 6308     __ ldr(v29, __ Q, tmpAddr);
 6309 
          // vq3 (declared in the first Barrett section) repeats v29, so every
          // montmul below multiplies by the constant just loaded. Each of the
          // four passes covers one 128-byte run of coeffs.
 6310     vs_ldpq(vq, kyberConsts);
 6311     __ add(tmpAddr, coeffs, 0);
 6312     load64shorts(vs1, tmpAddr);
 6313     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6314     __ add(tmpAddr, coeffs, 0);
 6315     store64shorts(vs2, tmpAddr);
 6316 
 6317     // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so
 6318     load64shorts(vs1, tmpAddr);
 6319     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6320     __ add(tmpAddr, coeffs, 128);
 6321     store64shorts(vs2, tmpAddr);
 6322 
 6323     // now tmpAddr contains coeffs + 256
 6324     load64shorts(vs1, tmpAddr);
 6325     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6326     __ add(tmpAddr, coeffs, 256);
 6327     store64shorts(vs2, tmpAddr);
 6328 
 6329     // now tmpAddr contains coeffs + 384
 6330     load64shorts(vs1, tmpAddr);
 6331     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6332     __ add(tmpAddr, coeffs, 384);
 6333     store64shorts(vs2, tmpAddr);
 6334 
 6335     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6336     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6337     __ ret(lr);
 6338 
 6339     // record the stub entry and end
 6340     store_archive_data(stub_id, start, __ pc());
 6341 
 6342     return start;
 6343   }
 6344 
 6345   // Kyber multiply polynomials in the NTT domain.
 6346   // Implements
 6347   // static int implKyberNttMult(
 6348   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 6349   //
 6350   // The actual algorithm that is used here differs from the one in the Java
 6351   // implementation: it uses Montgomery multiplications instead of Barrett
 6352   // reduction, but the end result modulo MLKEM_Q is the same. This is the
 6353   // Java equivalent of this intrinsic implementation:
 6354   // static void implKyberNttMultJava(short[] result, short[] ntta, short[] nttb) {
 6355   //         for (int m = 0; m < ML_KEM_N / 2; m++) {
 6356   //             int a0 = ntta[2 * m];
 6357   //             int a1 = ntta[2 * m + 1];
 6358   //             int b0 = nttb[2 * m];
 6359   //             int b1 = nttb[2 * m + 1];
 6360   //             int r = montMul(a0, b0) +
 6361   //                     montMul(montMul(a1, b1), MONT_ZETAS_FOR_NTT_MULT[m]);
 6362   //             result[2 * m] = (short) montMul(r, MONT_R_SQUARE_MOD_Q);
 6363   //             result[2 * m + 1] = (short) montMul(
 6364   //                     (montMul(a0, b1) + montMul(a1, b0)), MONT_R_SQUARE_MOD_Q);
 6365   //          }
 6366   // }
 6367   //
 6368   // result (short[256]) = c_rarg0
 6369   // ntta (short[256]) = c_rarg1
 6370   // nttb (short[256]) = c_rarg2
 6371   // zetas (short[128]) = c_rarg3
        //
        // The emitted code is a single loop. Each iteration reads 16 zetas and
        // 32 coefficients from each of ntta and nttb, writes 32 results, and
        // advances all four pointers; the loop exits once result has reached its
        // initial value + 512 bytes. The stub returns 0 in r0.
        //
        // Returns the entry address of the stub: either the address handed back
        // by load_archive_data() or the start of the newly generated code.
 6372   address generate_kyberNttMult() {
 6373     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 6374     int entry_count = StubInfo::entry_count(stub_id);
 6375     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, use it
          // and generate nothing.
 6376     address start = load_archive_data(stub_id);
 6377     if (start != nullptr) {
 6378       return start;
 6379     }
 6380     __ align(CodeEntryAlignment);
 6381     StubCodeMark mark(this, stub_id);
 6382     start = __ pc();
 6383     __ enter();
 6384 
 6385     const Register result = c_rarg0;
 6386     const Register ntta = c_rarg1;
 6387     const Register nttb = c_rarg2;
 6388     const Register zetas = c_rarg3;
 6389 
 6390     const Register kyberConsts = r10;
 6391     const Register limit = r11;
 6392 
          // vs2 is only ever passed as the temporary argument of the montmul
          // helpers below.
          // NOTE(review): other stubs in this file load q from kyberConsts + 16,
          // which would place q in vq[1] rather than vq[0] -- confirm the order
          // given in the vq comment.
 6393     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 6394     VSeq<4> vs3(16), vs4(20);
 6395     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 6396     VSeq<2> vz(28);          // pair of zetas
 6397     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 6398 
 6399     __ lea(kyberConsts,
 6400              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6401 
 6402     Label kyberNttMult_loop;
 6403 
          // loop bound: the address 512 bytes past the start of result
 6404     __ add(limit, result, 512);
 6405 
 6406     // load q and qinv
 6407     vs_ldpq(vq, kyberConsts);
 6408 
 6409     // load R^2 mod q (to convert back from Montgomery representation)
          // kyberConsts itself is advanced here; it is not read again afterwards.
          // v27 is the register that every element of vc refers to.
 6410     __ add(kyberConsts, kyberConsts, 64);
 6411     __ ldr(v27, __ Q, kyberConsts);
 6412 
 6413     __ BIND(kyberNttMult_loop);
 6414 
 6415     // load 16 zetas
 6416     vs_ldpq_post(vz, zetas);
 6417 
 6418     // load 2 sets of 32 coefficients from the two input arrays
 6419     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 6420     // are striped across pairs of vector registers
 6421     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 6422     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 6423     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 6424     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 6425 
 6426     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 6427     // i.e. montmul the first and second halves of vs1 in order and
 6428     // then with one sequence reversed storing the two results in vs3
 6429     //
 6430     // vs3[0] <- montmul(a0, b0)
 6431     // vs3[1] <- montmul(a1, b1)
 6432     // vs3[2] <- montmul(a0, b1)
 6433     // vs3[3] <- montmul(a1, b0)
 6434     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 6435     kyber_montmul16(vs_back(vs3),
 6436                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 6437 
 6438     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 6439     // i.e. montmul the first and second halves of vs4 in order and
 6440     // then with one sequence reversed storing the two results in vs1
 6441     //
 6442     // vs1[0] <- montmul(a2, b2)
 6443     // vs1[1] <- montmul(a3, b3)
 6444     // vs1[2] <- montmul(a2, b3)
 6445     // vs1[3] <- montmul(a3, b2)
          // vs1 can be overwritten here because its inputs (a0, a1, b0, b1) were
          // fully consumed by the two montmuls above.
 6446     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 6447     kyber_montmul16(vs_back(vs1),
 6448                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 6449 
 6450     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 6451     // We can schedule two montmuls at a time if we use a suitable vector
 6452     // sequence <vs3[1], vs1[1]>.
          // delta is the register-number distance from vs3[1] to vs1[1]; it is
          // used as the stride of the two-element sequence vs5.
 6453     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 6454     VSeq<2> vs5(vs3[1], delta);
 6455 
 6456     // vs3[1] <- montmul(montmul(a1, b1), z0)
 6457     // vs1[1] <- montmul(montmul(a3, b3), z1)
 6458     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 6459 
 6460     // add results in pairs storing in vs3
 6461     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 6462     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 6463     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 6464 
 6465     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 6466     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 6467     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 6468 
 6469     // vs1 <- montmul(vs3, montRSquareModQ)
 6470     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 6471 
 6472     // store back the two pairs of result vectors de-interleaved as 8H elements
 6473     // i.e. storing each pair of shorts striped across a register pair adjacent
 6474     // in memory
 6475     vs_st2_post(vs1, __ T8H, result);
 6476 
          // repeat until result has advanced to limit
 6477     __ cmp(result, limit);
 6478     __ br(Assembler::NE, kyberNttMult_loop);
 6479 
 6480     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6481     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6482     __ ret(lr);
 6483 
 6484     // record the stub entry and end
 6485     store_archive_data(stub_id, start, __ pc());
 6486 
 6487     return start;
 6488   }
 6489 
 6490   // Kyber add 2 polynomials.
 6491   // Implements
 6492   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 6493   //
 6494   // result (short[256]) = c_rarg0
 6495   // a (short[256]) = c_rarg1
 6496   // b (short[256]) = c_rarg2
        //
        // The emitted code is straight-line (no run-time loop). For each of the
        // 256 16-bit lanes it stores a + b + q into result, where q is the 8H
        // constant loaded from kyberConsts + 16. a, b and result are advanced by
        // the post-indexed accesses as they are consumed. The stub returns 0 in r0.
        //
        // Returns the entry address of the stub: either the address handed back
        // by load_archive_data() or the start of the newly generated code.
 6497   address generate_kyberAddPoly_2() {
 6498     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 6499     int entry_count = StubInfo::entry_count(stub_id);
 6500     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, use it
          // and generate nothing.
 6501     address start = load_archive_data(stub_id);
 6502     if (start != nullptr) {
 6503       return start;
 6504     }
 6505     __ align(CodeEntryAlignment);
 6506     StubCodeMark mark(this, stub_id);
 6507     start = __ pc();
 6508     __ enter();
 6509 
 6510     const Register result = c_rarg0;
 6511     const Register a = c_rarg1;
 6512     const Register b = c_rarg2;
 6513 
 6514     const Register kyberConsts = r11;
 6515 
 6516     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 6517     // So, we can load, add and store the data in 3 groups of 11,
 6518     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 6519     // registers. A further constraint is that the mapping needs
 6520     // to skip callee saves. So, we allocate the register
 6521     // sequences using two 8 sequences, two 2 sequences and two
 6522     // single registers.
          // vs1_* receive the values from a and accumulate the sums; vs2_*
          // receive the values from b.
 6523     VSeq<8> vs1_1(0);
 6524     VSeq<2> vs1_2(16);
 6525     FloatRegister vs1_3 = v28;
 6526     VSeq<8> vs2_1(18);
 6527     VSeq<2> vs2_2(26);
 6528     FloatRegister vs2_3 = v29;
 6529 
 6530     // two constant vector sequences
          // (stride 0: every element of vc_1 and vc_2 is v31, the same register
          // as vc_3)
 6531     VSeq<8> vc_1(31, 0);
 6532     VSeq<2> vc_2(31, 0);
 6533 
 6534     FloatRegister vc_3 = v31;
 6535     __ lea(kyberConsts,
 6536              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6537 
 6538     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
          // This loop runs at code-generation time, so the three groups are
          // emitted back to back. The i < 2 tests drop the eleventh register
          // from the last group, giving 11 + 11 + 10 = 32 quadwords.
 6539     for (int i = 0; i < 3; i++) {
 6540       // load 80 or 88 values from a into vs1_1/2/3
 6541       vs_ldpq_post(vs1_1, a);
 6542       vs_ldpq_post(vs1_2, a);
 6543       if (i < 2) {
 6544         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6545       }
 6546       // load 80 or 88 values from b into vs2_1/2/3
 6547       vs_ldpq_post(vs2_1, b);
 6548       vs_ldpq_post(vs2_2, b);
 6549       if (i < 2) {
 6550         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6551       }
 6552       // sum 80 or 88 values across vs1 and vs2 into vs1
 6553       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6554       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6555       if (i < 2) {
 6556         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6557       }
 6558       // add constant to all 80 or 88 results
            // (the constant is q, held in v31)
 6559       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6560       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6561       if (i < 2) {
 6562         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6563       }
 6564       // store 80 or 88 values
 6565       vs_stpq_post(vs1_1, result);
 6566       vs_stpq_post(vs1_2, result);
 6567       if (i < 2) {
 6568         __ str(vs1_3, __ Q, __ post(result, 16));
 6569       }
 6570     }
 6571 
 6572     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6573     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6574     __ ret(lr);
 6575 
 6576     // record the stub entry and end
 6577     store_archive_data(stub_id, start, __ pc());
 6578 
 6579     return start;
 6580   }
 6581 
 6582   // Kyber add 3 polynomials.
 6583   // Implements
 6584   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 6585   //
 6586   // result (short[256]) = c_rarg0
 6587   // a (short[256]) = c_rarg1
 6588   // b (short[256]) = c_rarg2
 6589   // c (short[256]) = c_rarg3
        //
        // The emitted code is straight-line (no run-time loop). For each of the
        // 256 16-bit lanes it stores a + b + c + q into result, where q is the
        // 8H constant loaded from kyberConsts + 16. a, b, c and result are
        // advanced by the post-indexed accesses as they are consumed. The stub
        // returns 0 in r0.
        //
        // Returns the entry address of the stub: either the address handed back
        // by load_archive_data() or the start of the newly generated code.
 6590   address generate_kyberAddPoly_3() {
 6591     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 6592     int entry_count = StubInfo::entry_count(stub_id);
 6593     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, use it
          // and generate nothing.
 6594     address start = load_archive_data(stub_id);
 6595     if (start != nullptr) {
 6596       return start;
 6597     }
 6598     __ align(CodeEntryAlignment);
 6599     StubCodeMark mark(this, stub_id);
 6600     start = __ pc();
 6601     __ enter();
 6602 
 6603     const Register result = c_rarg0;
 6604     const Register a = c_rarg1;
 6605     const Register b = c_rarg2;
 6606     const Register c = c_rarg3;
 6607 
 6608     const Register kyberConsts = r11;
 6609 
 6610     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6611     // quadwords.  So, we can load, add and store the data in 3
 6612     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6613     // of 10 or 11 registers. A further constraint is that the
 6614     // mapping needs to skip callee saves. So, we allocate the
 6615     // register sequences using two 8 sequences, two 2 sequences
 6616     // and two single registers.
          // vs1_* receive the values from a and accumulate the sums; vs2_*
          // receive the values from b and are then reused for the values from c.
 6617     VSeq<8> vs1_1(0);
 6618     VSeq<2> vs1_2(16);
 6619     FloatRegister vs1_3 = v28;
 6620     VSeq<8> vs2_1(18);
 6621     VSeq<2> vs2_2(26);
 6622     FloatRegister vs2_3 = v29;
 6623 
 6624     // two constant vector sequences
          // (stride 0: every element of vc_1 and vc_2 is v31, the same register
          // as vc_3)
 6625     VSeq<8> vc_1(31, 0);
 6626     VSeq<2> vc_2(31, 0);
 6627 
 6628     FloatRegister vc_3 = v31;
 6629 
 6630     __ lea(kyberConsts,
 6631              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6632 
 6633     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
          // This loop runs at code-generation time, so the three groups are
          // emitted back to back. The i < 2 tests drop the eleventh register
          // from the last group, giving 11 + 11 + 10 = 32 quadwords.
 6634     for (int i = 0; i < 3; i++) {
 6635       // load 80 or 88 values from a into vs1_1/2/3
 6636       vs_ldpq_post(vs1_1, a);
 6637       vs_ldpq_post(vs1_2, a);
 6638       if (i < 2) {
 6639         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6640       }
 6641       // load 80 or 88 values from b into vs2_1/2/3
 6642       vs_ldpq_post(vs2_1, b);
 6643       vs_ldpq_post(vs2_2, b);
 6644       if (i < 2) {
 6645         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6646       }
 6647       // sum 80 or 88 values across vs1 and vs2 into vs1
 6648       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6649       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6650       if (i < 2) {
 6651         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6652       }
 6653       // load 80 or 88 values from c into vs2_1/2/3
            // (the b values held there have already been added into vs1)
 6654       vs_ldpq_post(vs2_1, c);
 6655       vs_ldpq_post(vs2_2, c);
 6656       if (i < 2) {
 6657         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6658       }
 6659       // sum 80 or 88 values across vs1 and vs2 into vs1
 6660       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6661       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6662       if (i < 2) {
 6663         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6664       }
 6665       // add constant to all 80 or 88 results
            // (the constant is q, held in v31)
 6666       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6667       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6668       if (i < 2) {
 6669         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6670       }
 6671       // store 80 or 88 values
 6672       vs_stpq_post(vs1_1, result);
 6673       vs_stpq_post(vs1_2, result);
 6674       if (i < 2) {
 6675         __ str(vs1_3, __ Q, __ post(result, 16));
 6676       }
 6677     }
 6678 
 6679     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6680     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6681     __ ret(lr);
 6682 
 6683     // record the stub entry and end
 6684     store_archive_data(stub_id, start, __ pc());
 6685 
 6686     return start;
 6687   }
 6688 
 6689   // Kyber parse XOF output to polynomial coefficient candidates
 6690   // or decodePoly(12, ...).
 6691   // Implements
 6692   // static int implKyber12To16(
 6693   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6694   //
 6695   // we assume that parsed and condensed are allocated such that for
 6696   // n = (parsedLength + 63) / 64
 6697   // n blocks of 96 bytes of input can be processed, i.e.
 6698   // index + n * 96 <= condensed.length and
 6699   // n * 64 <= parsed.length
 6700   //
 6701   // condensed (byte[]) = c_rarg0
 6702   // condensedIndex = c_rarg1
 6703   // parsed (short[]) = c_rarg2
 6704   // parsedLength = c_rarg3
        //
        // Each pass of the generated loop consumes 96 input bytes and
        // emits 64 shorts, i.e. it unpacks 64 12-bit values into 16-bit
        // slots. The loop body always runs at least once. The stub
        // returns 0 in r0.
 6705   address generate_kyber12To16() {
 6706     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6707     int entry_count = StubInfo::entry_count(stub_id);
 6708     assert(entry_count == 1, "sanity check");
          // if load_archive_data hands back an entry point for this stub
          // use it and skip code generation altogether
 6709     address start = load_archive_data(stub_id);
 6710     if (start != nullptr) {
 6711       return start;
 6712     }
 6713     Label L_F00, L_loop;
 6714
 6715     __ align(CodeEntryAlignment);
 6716     StubCodeMark mark(this, stub_id);
 6717     start = __ pc();
 6718     __ enter();
 6719
 6720     const Register condensed = c_rarg0;
 6721     const Register condensedOffs = c_rarg1;
 6722     const Register parsed = c_rarg2;
 6723     const Register parsedLength = c_rarg3;
 6724
 6725     const Register tmpAddr = r11;
 6726
 6727     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6728     // quadwords so we need a 6 vector sequence for the inputs.
 6729     // Parsing produces 64 shorts, employing two 8 vector
 6730     // sequences to store and combine the intermediate data.
 6731     VSeq<6> vin(24);
 6732     VSeq<8> va(0), vb(16);
 6733
          // v31 is loaded with the mask constant emitted after the return
          // at label L_F00
 6734     __ adr(tmpAddr, L_F00);
 6735     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
          // advance condensed to the first byte to be parsed
 6736     __ add(condensed, condensed, condensedOffs);
 6737
 6738     __ BIND(L_loop);
 6739     // load 96 (6 x 16B) byte values
 6740     vs_ld3_post(vin, __ T16B, condensed);
 6741
 6742     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6743     // holds 48 (16x3) contiguous bytes from memory striped
 6744     // horizontally across each of the 16 byte lanes. Equivalently,
 6745     // that is 16 pairs of 12-bit integers. Likewise the back half
 6746     // holds the next 48 bytes in the same arrangement.
 6747
 6748     // Each vector in the front half can also be viewed as a vertical
 6749     // strip across the 16 pairs of 12 bit integers. Each byte in
 6750     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6751     // byte in vin[1] stores the high 4 bits of the first int and the
 6752     // low 4 bits of the second int. Each byte in vin[2] stores the
 6753     // high 8 bits of the second int. Likewise the vectors in second
 6754     // half.
 6755
 6756     // Converting the data to 16-bit shorts requires first of all
 6757     // expanding each of the 6 x 16B vectors into 6 corresponding
 6758     // pairs of 8H vectors. Mask, shift and add operations on the
 6759     // resulting vector pairs can be used to combine 4 and 8 bit
 6760     // parts of related 8H vector elements.
 6761     //
 6762     // The middle vectors (vin[1] and vin[4]) are actually expanded
 6763     // twice, one copy manipulated to provide the high 4 bits
 6764     // belonging to the first short in a pair and another copy
 6765     // manipulated to provide the low 4 bits belonging to the
 6766     // second short in a pair. This is why the vector sequences va
 6767     // and vb used to hold the expanded 8H elements are of length 8.
 6768
 6769     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6770     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6771     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6772     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6773     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6774     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6775     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6776     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6777
 6778     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6779     // and vb[4:5]
 6780     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6781     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6782     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6783     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6784     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6785     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6786
 6787     // shift lo byte of copy 1 of the middle stripe into the high byte
 6788     __ shl(va[2], __ T8H, va[2], 8);
 6789     __ shl(va[3], __ T8H, va[3], 8);
 6790     __ shl(vb[2], __ T8H, vb[2], 8);
 6791     __ shl(vb[3], __ T8H, vb[3], 8);
 6792
 6793     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6794     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6795     // are in bit positions [4..11].
 6796     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6797     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6798     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6799     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6800
 6801     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6802     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6803     // copy2
          // n.b. after the shift by 8 above the mask 0x0f00 in v31 keeps
          // only bits [8..11] of each 16-bit lane of copy1
 6804     __ andr(va[2], __ T16B, va[2], v31);
 6805     __ andr(va[3], __ T16B, va[3], v31);
 6806     __ ushr(va[4], __ T8H, va[4], 4);
 6807     __ ushr(va[5], __ T8H, va[5], 4);
 6808     __ andr(vb[2], __ T16B, vb[2], v31);
 6809     __ andr(vb[3], __ T16B, vb[3], v31);
 6810     __ ushr(vb[4], __ T8H, vb[4], 4);
 6811     __ ushr(vb[5], __ T8H, vb[5], 4);
 6812
 6813     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6814     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6815     // n.b. the ordering ensures: i) inputs are consumed before they
 6816     // are overwritten ii) the order of 16-bit results across successive
 6817     // pairs of vectors in va and then vb reflects the order of the
 6818     // corresponding 12-bit inputs
 6819     __ addv(va[0], __ T8H, va[0], va[2]);
 6820     __ addv(va[2], __ T8H, va[1], va[3]);
 6821     __ addv(va[1], __ T8H, va[4], va[6]);
 6822     __ addv(va[3], __ T8H, va[5], va[7]);
 6823     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6824     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6825     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6826     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6827
 6828     // store 64 results interleaved as shorts
 6829     vs_st2_post(vs_front(va), __ T8H, parsed);
 6830     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6831
          // 64 shorts were produced by this pass; loop while more remain
          // NOTE(review): parsedLength is a Java int but sub/cmp here are
          // presumably the 64-bit forms (no w suffix) -- confirm that the
          // upper half of c_rarg3 is well defined on entry
 6832     __ sub(parsedLength, parsedLength, 64);
 6833     __ cmp(parsedLength, (u1)0);
 6834     __ br(Assembler::GT, L_loop);
 6835
 6836     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6837     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6838     __ ret(lr);
 6839
 6840     // bind label and generate constant data used by this stub
          // n.b. the data follows the ret so it is never executed; it is
          // two 64-bit words i.e. one quadword holding 8 x 0x0f00
 6841     __ BIND(L_F00);
 6842     __ emit_int64(0x0f000f000f000f00);
 6843     __ emit_int64(0x0f000f000f000f00);
 6844
 6845     // record the stub entry and end
 6846     store_archive_data(stub_id, start, __ pc());
 6847
 6848     return start;
 6849   }
 6850 
 6851   // Kyber Barrett reduce function.
 6852   // Implements
 6853   // static int implKyberBarrettReduce(short[] coeffs) {}
 6854   //
 6855   // coeffs (short[256]) = c_rarg0
        //
        // The generated code rewrites every 16-bit coefficient x in place as
        //   x - (((2 * x * kyberBarrettMultiplier) >> 16) >> 11) * kyber_q
        // using the two quadword constants loaded from _kyberConsts + 16.
        // The stub returns 0 in r0.
 6856   address generate_kyberBarrettReduce() {
 6857     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6858     int entry_count = StubInfo::entry_count(stub_id);
 6859     assert(entry_count == 1, "sanity check");
          // if load_archive_data hands back an entry point for this stub
          // use it and skip code generation altogether
 6860     address start = load_archive_data(stub_id);
 6861     if (start != nullptr) {
 6862       return start;
 6863     }
 6864     __ align(CodeEntryAlignment);
 6865     StubCodeMark mark(this, stub_id);
 6866     start = __ pc();
 6867     __ enter();
 6868
 6869     const Register coeffs = c_rarg0;
 6870
 6871     const Register kyberConsts = r10;
 6872     const Register result = r11;
 6873
 6874     // As above we process 256 sets of values in total i.e. 32 x
 6875     // 8H quadwords. So, we can load, reduce and store the data in 3
 6876     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6877     // of 10 or 11 registers. A further constraint is that the
 6878     // mapping needs to skip callee saves. So, we allocate the
 6879     // register sequences using two 8 sequences, two 2 sequences
 6880     // and two single registers.
 6881     VSeq<8> vs1_1(0);
 6882     VSeq<2> vs1_2(16);
 6883     FloatRegister vs1_3 = v28;
 6884     VSeq<8> vs2_1(18);
 6885     VSeq<2> vs2_2(26);
 6886     FloatRegister vs2_3 = v29;
 6887
 6888     // we also need a pair of corresponding constant sequences
          // n.b. presumably the (30, 0) and (31, 0) arguments mean a base
          // register with a step of 0, so every element names v30 or v31
          // respectively -- TODO confirm against VSeq
 6889
 6890     VSeq<8> vc1_1(30, 0);
 6891     VSeq<2> vc1_2(30, 0);
 6892     FloatRegister vc1_3 = v30; // for kyber_q
 6893
 6894     VSeq<8> vc2_1(31, 0);
 6895     VSeq<2> vc2_2(31, 0);
 6896     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6897
          // the loads below post-increment coeffs so keep a second copy of
          // the array address in result to use as the store pointer
 6898     __ add(result, coeffs, 0);
 6899     __ lea(kyberConsts,
 6900              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6901
 6902     // load q and the multiplier for the Barrett reduction
          // n.b. these are the two adjacent quadwords starting 16 bytes
          // into _kyberConsts
 6903     __ add(kyberConsts, kyberConsts, 16);
 6904     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6905
          // The loop is unrolled at code generation time. The first two
          // passes (i < 2) handle 11 quadwords (88 shorts) and the last
          // pass handles 10 quadwords (80 shorts): 88 + 88 + 80 == 256.
 6906     for (int i = 0; i < 3; i++) {
 6907       // load 80 or 88 coefficients
 6908       vs_ldpq_post(vs1_1, coeffs);
 6909       vs_ldpq_post(vs1_2, coeffs);
 6910       if (i < 2) {
 6911         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6912       }
 6913
 6914       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6915       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6916       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6917       if (i < 2) {
 6918         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6919       }
 6920
 6921       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
            // n.b. the doubling and >> 16 above followed by >> 11 here
            // amounts to >> 26 overall
 6922       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6923       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6924       if (i < 2) {
 6925         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6926       }
 6927
 6928       // vs1 <- vs1 - vs2 * kyber_q
 6929       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6930       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6931       if (i < 2) {
 6932         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6933       }
 6934
            // store the 80 or 88 reduced coefficients back over the inputs
 6935       vs_stpq_post(vs1_1, result);
 6936       vs_stpq_post(vs1_2, result);
 6937       if (i < 2) {
 6938         __ str(vs1_3, __ Q, __ post(result, 16));
 6939       }
 6940     }
 6941
 6942     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6943     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6944     __ ret(lr);
 6945
 6946     // record the stub entry and end
 6947     store_archive_data(stub_id, start, __ pc());
 6948
 6949     return start;
 6950   }
 6951 
 6952 
 6953   // Dilithium-specific montmul helper routines that generate parallel
 6954   // code for, respectively, a single 4x4s vector sequence montmul or
 6955   // two such multiplies in a row.
 6956 
 6957   // Perform 16 32-bit Montgomery multiplications in parallel
        //
        // va receives the products of the inputs vb and vc (callers in this
        // file pass va == vc to multiply in place). vtmp provides 4 scratch
        // registers and vq holds the constants loaded by callers as
        // "qInv, q". The call simply forwards to vs_montmul4 with
        // arrangement 4S.
 6958   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6959                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6960     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6961     // It will assert that the register use is valid
 6962     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6963   }
 6964 
 6965   // Perform 2x16 32-bit Montgomery multiplications in parallel
        //
        // va receives the products of the inputs vb and vc. vtmp provides
        // 4 scratch registers and vq holds the constants loaded by callers
        // as "qInv, q". The same vtmp and vq are used for both halves.
 6966   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6967                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6968     // Schedule two successive 4x4S multiplies via the montmul helper
 6969     // on the front and back halves of va, vb and vc. The helper will
 6970     // assert that the register use has no overlap conflicts on each
 6971     // individual call but we also need to ensure that the necessary
 6972     // disjoint/equality constraints are met across both calls.
 6973
 6974     // vb, vc, vtmp and vq must be disjoint. va must either be
 6975     // disjoint from all other registers or equal vc
 6976
 6977     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6978     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6979     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6980
 6981     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6982     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6983
 6984     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6985
 6986     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6987     assert(vs_disjoint(va, vb), "va and vb overlap");
 6988     assert(vs_disjoint(va, vq), "va and vq overlap");
 6989     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6990
 6991     // We multiply the front and back halves of each sequence 4 at a
 6992     // time because
 6993     //
 6994     // 1) we are currently only able to get 4-way instruction
 6995     // parallelism at best
 6996     //
 6997     // 2) we need registers for the constants in vq and temporary
 6998     // scratch registers to hold intermediate results so vtmp can only
 6999     // be a VSeq<4> which means we only have 4 scratch slots.
 7000
          // n.b. the checks above are what make it safe for the second
          // call to reuse vtmp and vq unchanged after the first
 7001     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 7002     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 7003   }
 7004 
 7005   // Perform combined montmul then add/sub on 4x4S vectors.
        //
        // On entry va0 and va1 hold the two sets of coefficients and vc the
        // multipliers. On exit va0 = a0 + a and va1 = a0 - a, where
        // a = montmul(a1, c). n.b. vc is clobbered: it is used to hold a.
        // vtmp provides scratch registers and vq holds the constants
        // loaded by callers as "qInv, q".
 7006   void dilithium_montmul16_sub_add(
 7007           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 7008           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 7009     // compute a = montmul(a1, c)
 7010     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 7011     // output a1 = a0 - a
          // n.b. the subtract must come first as it still needs the
          // original a0 which the add below overwrites
 7012     vs_subv(va1, __ T4S, va0, vc);
 7013     //    and a0 = a0 + a
 7014     vs_addv(va0, __ T4S, va0, vc);
 7015   }
 7016 
 7017   // Perform combined add/sub then montmul on 4x4S vectors.
        //
        // On entry va0 and va1 hold the two sets of coefficients and vb the
        // multipliers. On exit va0 = a0 + a1 and va1 = montmul(a0 - a1, b).
        // vtmp1 is clobbered: it holds the difference a0 - a1. vtmp2
        // provides the scratch registers for the multiply and vq holds the
        // constants loaded by callers as "qInv, q".
 7018   void dilithium_sub_add_montmul16(
 7019           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 7020           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 7021     // compute c = a0 - a1
          // n.b. this must happen before a0 is overwritten by the add
 7022     vs_subv(vtmp1, __ T4S, va0, va1);
 7023     // output a0 = a0 + a1
 7024     vs_addv(va0, __ T4S, va0, va1);
 7025     // output a1 = b montmul c
 7026     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 7027   }
 7028 
 7029   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 7030   // in the Java implementation come in sequences of at least 8, so we
 7031   // can use ldpq to collect the corresponding data into pairs of vector
 7032   // registers.
 7033   // We collect the coefficients corresponding to the 'j+l' indexes into
 7034   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 7035   // then we do the (Montgomery) multiplications by the zetas in parallel
 7036   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 7037   // v0-v7, then do the additions into v24-v31 and the subtractions into
 7038   // v0-v7 and finally save the results back to the coeffs array.
        //
        // dilithiumConsts is the address from which qInv and q are reloaded
        // coeffs is the base address for the indexed coefficient accesses
        // zetas is the address from which successive inputs are loaded
        //
        // Both loops below run at code generation time, so 5 levels x 4
        // passes of straight-line code are emitted.
 7039   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 7040     const Register coeffs, const Register zetas) {
          // c1 and c2 are the start positions used for the 'j' and 'j+l'
          // coefficients respectively. c1 stays at 0; c2 is halved after
          // every level (presumably both are byte offsets i.e. 512 bytes ==
          // 128 ints -- TODO confirm against vs_ldpq_indexed)
 7041     int c1 = 0;
 7042     int c2 = 512;
 7043     int startIncr;
 7044     // don't use callee save registers v8 - v15
 7045     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7046     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7047     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7048     int offsets[4] = { 0, 32, 64, 96 };
 7049
 7050     for (int level = 0; level < 5; level++) {
 7051       int c1Start = c1;
 7052       int c2Start = c2;
            // levels 0 - 2 use the initial offsets { 0, 32, 64, 96 }
            // level 3 uses { 0, 32, 128, 160 }
            // level 4 uses { 0, 64, 128, 192 }
 7053       if (level == 3) {
 7054         offsets[1] = 32;
 7055         offsets[2] = 128;
 7056         offsets[3] = 160;
 7057       } else if (level == 4) {
 7058         offsets[1] = 64;
 7059         offsets[2] = 128;
 7060         offsets[3] = 192;
 7061       }
 7062
 7063       // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 7064       // time at 4 different offsets and multiply them in order by the
 7065       // next set of input values. So we employ indexed load and store
 7066       // pair instructions with arrangement 4S.
 7067       for (int i = 0; i < 4; i++) {
 7068         // reload q and qinv
              // n.b. this is needed on every pass because vq overlaps vs3
              // which is overwritten by the add below
 7069         vs_ldpq(vq, dilithiumConsts); // qInv, q
 7070         // load 8x4S coefficients via second start pos == c2
 7071         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 7072         // load next 8x4S inputs == b
 7073         vs_ldpq_post(vs2, zetas);
 7074         // compute a == c2 * b mod MONT_Q
 7075         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7076         // load 8x4s coefficients via first start pos == c1
 7077         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7078         // compute a1 =  c1 + a
 7079         vs_addv(vs3, __ T4S, vs1, vs2);
 7080         // compute a2 =  c1 - a
 7081         vs_subv(vs1, __ T4S, vs1, vs2);
 7082         // output a1 and a2
 7083         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7084         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 7085
              // k counts the passes across all levels (0 .. 19)
 7086         int k = 4 * level + i;
 7087
              // advance both start positions to the next group:
              //   by 256 from level 2 onwards (k > 7)
              //   by 384 after the second pass of level 1 (k == 5)
              //   by 128 otherwise (all of level 0 and k == 4, 6, 7)
 7088         if (k > 7) {
 7089           startIncr = 256;
 7090         } else if (k == 5) {
 7091           startIncr = 384;
 7092         } else {
 7093           startIncr = 128;
 7094         }
 7095
 7096         c1Start += startIncr;
 7097         c2Start += startIncr;
 7098       }
 7099
 7100       c2 /= 2;
 7101     }
 7102   }
 7103 
 7104   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 7105   // Implements the method
 7106   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 7107   // of the sun.security.provider.ML_DSA class.
 7108   //
 7109   // coeffs (int[256]) = c_rarg0
 7110   // zetas (int[256]) = c_rarg1
        //
        // The coefficients are updated in place. zetas is advanced with
        // post-increment addressing as successive inputs are consumed.
        // All loops below run at code generation time so the stub is
        // straight-line code. The stub returns 0 in r0.
 7111   address generate_dilithiumAlmostNtt() {
 7112     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 7113     int entry_count = StubInfo::entry_count(stub_id);
 7114     assert(entry_count == 1, "sanity check");
          // if load_archive_data hands back an entry point for this stub
          // use it and skip code generation altogether
 7115     address start = load_archive_data(stub_id);
 7116     if (start != nullptr) {
 7117       return start;
 7118     }
 7119     __ align(CodeEntryAlignment);
 7120     StubCodeMark mark(this, stub_id);
 7121     start = __ pc();
 7122     __ enter();
 7123
 7124     const Register coeffs = c_rarg0;
 7125     const Register zetas = c_rarg1;
 7126
 7127     const Register tmpAddr = r9;
 7128     const Register dilithiumConsts = r10;
          // n.b. coeffs is only ever used as a base for offset addressing
          // in this stub so no separate copy of the array address is kept
 7130     // don't use callee save registers v8 - v15
 7131     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7132     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7133     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7134     int offsets[4] = { 0, 32, 64, 96};
 7135     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7136     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7138     __ lea(dilithiumConsts,
 7139              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7140
 7141     // Each level represents one iteration of the outer for loop of the Java version.
 7142
 7143     // level 0-4
 7144     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 7145
 7146     // level 5
 7147
 7148     // At level 5 the coefficients we need to combine with the zetas
 7149     // are grouped in memory in blocks of size 4. So, for both sets of
 7150     // coefficients we load 4 adjacent values at 8 different offsets
 7151     // using an indexed ldr with register variant Q and multiply them
 7152     // in sequence order by the next set of inputs. Likewise we store
 7153     // the results using an indexed str with register variant Q.
 7154     for (int i = 0; i < 1024; i += 256) {
 7155       // reload constants q, qinv each iteration as they get clobbered later
 7156       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7157       // load 32 (8x4S) coefficients via first offsets = c1
 7158       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7159       // load next 32 (8x4S) inputs = b
 7160       vs_ldpq_post(vs2, zetas);
 7161       // a = b montmul c1
 7162       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7163       // load 32 (8x4S) coefficients via second offsets = c2
 7164       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 7165       // add/sub with result of multiply
 7166       vs_addv(vs3, __ T4S, vs1, vs2);     // vs3 = c2 + a
 7167       vs_subv(vs1, __ T4S, vs1, vs2);     // vs1 = c2 - a
 7168       // write back new coefficients using same offsets
            // n.b. the sums replace the c2 inputs and the differences
            // replace the c1 inputs
 7169       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 7170       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 7171     }
 7172
 7173     // level 6
 7174     // At level 6 the coefficients we need to combine with the zetas
 7175     // are grouped in memory in pairs, one pair of add/sub inputs
 7176     // followed by one pair of montmul inputs. We can still implement
 7177     // the montmul+sub+add using 4-way parallelism but only if we
 7178     // combine the coefficients with the zetas 16 at a time. We load 8
 7179     // adjacent values at 4 different offsets using an ld2 load with
 7180     // arrangement 2D. That interleaves the lower and upper halves of
 7181     // each pair of quadwords into successive vector registers. We
 7182     // then need to montmul the 4 odd elements of the coefficients
 7183     // register sequence by the zetas in order and then add/sub the
 7184     // products with the 4 even elements of the coefficients register
 7185     // sequence. We use an equivalent st2 operation to store the
 7186     // results back into memory de-interleaved.
 7187     for (int i = 0; i < 1024; i += 128) {
 7188       // reload constants q, qinv each iteration as they get clobbered later
 7189       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7190       // load interleaved 16 (4x2D) coefficients via offsets
 7191       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7192       // load next 16 (4x4S) inputs
 7193       vs_ldpq_post(vs_front(vs2), zetas);
 7194       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7195       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7196                                   vs_front(vs2), vtmp, vq);
 7197       // store interleaved 16 (4x2D) coefficients via offsets
 7198       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7199     }
 7200
 7201     // level 7
 7202     // At level 7 the coefficients we need to combine with the zetas
 7203     // occur singly with add/sub inputs alternating with montmul
 7204     // inputs. Once again we can use 4-way parallelism to combine 16
 7205     // zetas at a time. However, we have to load 8 adjacent values at
 7206     // 4 different offsets using an ld2 load with arrangement 4S. That
 7207     // interleaves the even words of each pair into one
 7208     // coefficients vector register and the odd words of the pair
 7209     // into the next register. We then need to montmul the 4 odd
 7210     // elements of the coefficients register sequence by the zetas in
 7211     // order and then add/sub the products with the 4 even elements
 7212     // of the coefficients register sequence. We use an equivalent
 7213     // st2 operation to store the results back into memory
          // de-interleaved.
 7214
 7215     for (int i = 0; i < 1024; i += 128) {
 7216       // reload constants q, qinv each iteration as they get clobbered later
 7217       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7218       // load interleaved 16 (4x4S) coefficients via offsets
 7219       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7220       // load next 16 (4x4S) inputs
 7221       vs_ldpq_post(vs_front(vs2), zetas);
 7222       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7223       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7224                                   vs_front(vs2), vtmp, vq);
 7225       // store interleaved 16 (4x4S) coefficients via offsets
 7226       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7227     }
 7228     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7229     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 7230     __ ret(lr);
 7231
 7232     // record the stub entry and end
 7233     store_archive_data(stub_id, start, __ pc());
 7234
 7235     return start;
 7236   }
 7237 
 7238   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 7239   // in the Java implementation come in sequences of at least 8, so we
 7240   // can use ldpq to collect the corresponding data into pairs of vector
 7241   // registers
 7242   // We collect the coefficients that correspond to the 'j's into vs1
 7243   // the coefficients that correspond to the 'j+l's into vs2 then
 7244   // do the additions into vs3 and the subtractions into vs1 then
 7245   // save the result of the additions, load the zetas into vs2
 7246   // do the (Montgomery) multiplications by zeta in parallel into vs2
 7247   // finally save the results back to the coeffs array
        //
        // dilithiumConsts is the address from which qInv and q are reloaded
        // coeffs is the base address for the indexed coefficient accesses
        // zetas is the address from which successive inputs are loaded
        //
        // Both loops below run at code generation time, so 5 levels x 4
        // passes of straight-line code are emitted.
 7248   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 7249     const Register coeffs, const Register zetas) {
          // c1 and c2 are the start positions used for the 'j' and 'j+l'
          // coefficients respectively. c1 stays at 0; c2 is doubled after
          // every level (presumably both are byte offsets i.e. 32 bytes ==
          // 8 ints -- TODO confirm against vs_ldpq_indexed)
 7250     int c1 = 0;
 7251     int c2 = 32;
 7252     int startIncr;
 7253     int offsets[4];
 7254     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7255     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7256     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7257
 7258     offsets[0] = 0;
 7259
 7260     for (int level = 3; level < 8; level++) {
 7261       int c1Start = c1;
 7262       int c2Start = c2;
            // level 3 uses offsets { 0, 64, 128, 192 }
            // level 4 uses offsets { 0, 32, 128, 160 }
            // levels 5 - 7 use offsets { 0, 32, 64, 96 }
 7263       if (level == 3) {
 7264         offsets[1] = 64;
 7265         offsets[2] = 128;
 7266         offsets[3] = 192;
 7267       } else if (level == 4) {
 7268         offsets[1] = 32;
 7269         offsets[2] = 128;
 7270         offsets[3] = 160;
 7271       } else {
 7272         offsets[1] = 32;
 7273         offsets[2] = 64;
 7274         offsets[3] = 96;
 7275       }
 7276
 7277       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 7278       // time at 4 different offsets and multiply them in order by the
 7279       // next set of input values. So we employ indexed load and store
 7280       // pair instructions with arrangement 4S.
 7281       for (int i = 0; i < 4; i++) {
 7282         // load v1 32 (8x4S) coefficients relative to first start index
 7283         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7284         // load v2 32 (8x4S) coefficients relative to second start index
 7285         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 7286         // a0 = v1 + v2 -- n.b. clobbers vq (v30 and v31 belong to vs3)
 7287         vs_addv(vs3, __ T4S, vs1, vs2);
 7288         // a1 = v1 - v2
 7289         vs_subv(vs1, __ T4S, vs1, vs2);
 7290         // save a0 relative to first start index
 7291         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7292         // load constants q, qinv each iteration as they get clobbered above
              // n.b. vs3 has been stored by this point so it is safe to
              // reuse its registers for vq and vtmp
 7293         vs_ldpq(vq, dilithiumConsts); // qInv, q
 7294         // load b next 32 (8x4S) inputs
 7295         vs_ldpq_post(vs2, zetas);
 7296         // a = a1 montmul b
 7297         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7298         // save a relative to second start index
 7299         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 7300
              // k identifies the pass: 12 .. 31 across levels 3 .. 7
 7301         int k = 4 * level + i;
 7302
              // advance both start positions to the next group:
              //   by 256 for levels 3 - 5 (k < 24)
              //   by 384 after the second pass of level 6 (k == 25)
              //   by 128 otherwise (k == 24, 26, 27 and all of level 7)
 7303         if (k < 24) {
 7304           startIncr = 256;
 7305         } else if (k == 25) {
 7306           startIncr = 384;
 7307         } else {
 7308           startIncr = 128;
 7309         }
 7310
 7311         c1Start += startIncr;
 7312         c2Start += startIncr;
 7313       }
 7314
 7315       c2 *= 2;
 7316     }
 7317   }
 7318 
 7319   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 7320   // Implements the method
 7321   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 7322   // the sun.security.provider.ML_DSA class.
 7323   //
 7324   // coeffs (int[256]) = c_rarg0
 7325   // zetas (int[256]) = c_rarg1
 7326   address generate_dilithiumAlmostInverseNtt() {
 7327     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 7328     int entry_count = StubInfo::entry_count(stub_id);
 7329     assert(entry_count == 1, "sanity check");
 7330     address start = load_archive_data(stub_id);
 7331     if (start != nullptr) {
 7332       return start;
 7333     }
 7334     __ align(CodeEntryAlignment);
 7335     StubCodeMark mark(this, stub_id);
 7336     start = __ pc();
 7337     __ enter();
 7338 
 7339     const Register coeffs = c_rarg0;
 7340     const Register zetas = c_rarg1;
 7341 
 7342     const Register tmpAddr = r9;
 7343     const Register dilithiumConsts = r10;
 7344     const Register result = r11;
 7345     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7346     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 7347     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7348     int offsets[4] = { 0, 32, 64, 96 };
 7349     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7350     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7351 
 7352     __ add(result, coeffs, 0);
 7353     __ lea(dilithiumConsts,
 7354              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7355 
 7356     // Each level represents one iteration of the outer for loop of the Java version
 7357 
 7358     // level 0
 7359     // At level 0 we need to interleave adjacent quartets of
 7360     // coefficients before we multiply and add/sub by the next 16
 7361     // zetas just as we did for level 7 in the multiply code. So we
 7362     // load and store the values using an ld2/st2 with arrangement 4S.
 7363     for (int i = 0; i < 1024; i += 128) {
 7364       // load constants q, qinv
 7365       // n.b. this can be moved out of the loop as they do not get
 7366       // clobbered by first two loops
 7367       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7368       // a0/a1 load interleaved 32 (8x4S) coefficients
 7369       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7370       // b load next 32 (8x4S) inputs
 7371       vs_ldpq_post(vs_front(vs2), zetas);
 7372       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7373       // n.b. second half of vs2 provides temporary register storage
 7374       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7375                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7376       // a0/a1 store interleaved 32 (8x4S) coefficients
 7377       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7378     }
 7379 
 7380     // level 1
 7381     // At level 1 we need to interleave pairs of adjacent pairs of
 7382     // coefficients before we multiply by the next 16 zetas just as we
 7383     // did for level 6 in the multiply code. So we load and store the
 7384     // values an ld2/st2 with arrangement 2D.
 7385     for (int i = 0; i < 1024; i += 128) {
 7386       // a0/a1 load interleaved 32 (8x2D) coefficients
 7387       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7388       // b load next 16 (4x4S) inputs
 7389       vs_ldpq_post(vs_front(vs2), zetas);
 7390       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7391       // n.b. second half of vs2 provides temporary register storage
 7392       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7393                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7394       // a0/a1 store interleaved 32 (8x2D) coefficients
 7395       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7396     }
 7397 
 7398     // level 2
 7399     // At level 2 coefficients come in blocks of 4. So, we load 4
 7400     // adjacent coefficients at 8 distinct offsets for both the first
 7401     // and second coefficient sequences, using an ldr with register
 7402     // variant Q then combine them with next set of 32 zetas. Likewise
 7403     // we store the results using an str with register variant Q.
 7404     for (int i = 0; i < 1024; i += 256) {
 7405       // c0 load 32 (8x4S) coefficients via first offsets
 7406       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7407       // c1 load 32 (8x4S) coefficients via second offsets
 7408       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 7409       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 7410       vs_addv(vs3, __ T4S, vs1, vs2);
 7411       // c = c0 - c1
 7412       vs_subv(vs1, __ T4S, vs1, vs2);
 7413       // store a0 32 (8x4S) coefficients via first offsets
 7414       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 7415       // b load 32 (8x4S) next inputs
 7416       vs_ldpq_post(vs2, zetas);
 7417       // reload constants q, qinv -- they were clobbered earlier
 7418       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7419       // compute a1 = b montmul c
 7420       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7421       // store a1 32 (8x4S) coefficients via second offsets
 7422       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 7423     }
 7424 
 7425     // level 3-7
 7426     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 7427 
 7428     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7429     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 7430     __ ret(lr);
 7431 
 7432     // record the stub entry and end
 7433     store_archive_data(stub_id, start, __ pc());
 7434 
 7435     return start;
 7436   }
 7437 
 7438   // Dilithium multiply polynomials in the NTT domain.
 7439   // Straightforward implementation of the method
 7440   // static int implDilithiumNttMult(
 7441   //              int[] product, int[] coeffs1, int[] coeffs2) {}
 7442   // of the sun.security.provider.ML_DSA class.
 7443   //
 7444   // result (int[256]) = c_rarg0
 7445   // poly1 (int[256]) = c_rarg1
 7446   // poly2 (int[256]) = c_rarg2
 7447   address generate_dilithiumNttMult() {
 7448     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 7449     int entry_count = StubInfo::entry_count(stub_id);
 7450     assert(entry_count == 1, "sanity check");
 7451     address start = load_archive_data(stub_id);
 7452     if (start != nullptr) {
 7453       return start;
 7454     }
 7455     __ align(CodeEntryAlignment);
 7456     StubCodeMark mark(this, stub_id);
 7457     start = __ pc();
 7458     __ enter();
 7459 
 7460     Label L_loop;
 7461 
 7462     const Register result = c_rarg0;
 7463     const Register poly1 = c_rarg1;
 7464     const Register poly2 = c_rarg2;
 7465 
 7466     const Register dilithiumConsts = r10;
 7467     const Register len = r11;
 7468 
 7469     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7470     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7471     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7472     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 7473 
 7474     __ lea(dilithiumConsts,
 7475              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7476 
 7477     // load constants q, qinv
 7478     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7479     // load constant rSquare into v29
 7480     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 7481 
 7482     __ mov(len, zr);
 7483     __ add(len, len, 1024);
 7484 
 7485     __ BIND(L_loop);
 7486 
 7487     // b load 32 (8x4S) next inputs from poly1
 7488     vs_ldpq_post(vs1, poly1);
 7489     // c load 32 (8x4S) next inputs from poly2
 7490     vs_ldpq_post(vs2, poly2);
 7491     // compute a = b montmul c
 7492     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7493     // compute a = rsquare montmul a
 7494     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 7495     // save a 32 (8x4S) results
 7496     vs_stpq_post(vs2, result);
 7497 
 7498     __ sub(len, len, 128);
 7499     __ cmp(len, (u1)128);
 7500     __ br(Assembler::GE, L_loop);
 7501 
 7502     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7503     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 7504     __ ret(lr);
 7505 
 7506     // record the stub entry and end
 7507     store_archive_data(stub_id, start, __ pc());
 7508 
 7509     return start;
 7510   }
 7511 
 7512   // Dilithium Montgomery multiply an array by a constant.
 7513   // A straightforward implementation of the method
 7514   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 7515   // of the sun.security.provider.ML_DSA class
 7516   //
 7517   // coeffs (int[256]) = c_rarg0
 7518   // constant (int) = c_rarg1
 7519   address generate_dilithiumMontMulByConstant() {
 7520     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 7521     int entry_count = StubInfo::entry_count(stub_id);
 7522     assert(entry_count == 1, "sanity check");
 7523     address start = load_archive_data(stub_id);
 7524     if (start != nullptr) {
 7525       return start;
 7526     }
 7527     __ align(CodeEntryAlignment);
 7528     StubCodeMark mark(this, stub_id);
 7529     start = __ pc();
 7530     __ enter();
 7531 
 7532     Label L_loop;
 7533 
 7534     const Register coeffs = c_rarg0;
 7535     const Register constant = c_rarg1;
 7536 
 7537     const Register dilithiumConsts = r10;
 7538     const Register result = r11;
 7539     const Register len = r12;
 7540 
 7541     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7542     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7543     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7544     VSeq<8> vconst(29, 0);             // for montmul by constant
 7545 
 7546     // results track inputs
 7547     __ add(result, coeffs, 0);
 7548     __ lea(dilithiumConsts,
 7549              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7550 
 7551     // load constants q, qinv -- they do not get clobbered by first two loops
 7552     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7553     // copy caller supplied constant across vconst
 7554     __ dup(vconst[0], __ T4S, constant);
 7555     __ mov(len, zr);
 7556     __ add(len, len, 1024);
 7557 
 7558     __ BIND(L_loop);
 7559 
 7560     // load next 32 inputs
 7561     vs_ldpq_post(vs2, coeffs);
 7562     // mont mul by constant
 7563     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 7564     // write next 32 results
 7565     vs_stpq_post(vs2, result);
 7566 
 7567     __ sub(len, len, 128);
 7568     __ cmp(len, (u1)128);
 7569     __ br(Assembler::GE, L_loop);
 7570 
 7571     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7572     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 7573     __ ret(lr);
 7574 
 7575     // record the stub entry and end
 7576     store_archive_data(stub_id, start, __ pc());
 7577 
 7578     return start;
 7579   }
 7580 
 7581   // Dilithium decompose poly.
 7582   // Implements the method
 7583   //    static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart,
 7584   //                                          int twoGamma2, int multiplier) {
 7585   // of the sun.security.provider.ML_DSA class
 7586   //
 7587   // input (int[256]) = c_rarg0
 7588   // lowPart (int[256]) = c_rarg1
 7589   // highPart (int[256]) = c_rarg2
 7590   // twoGamma2  (int) = c_rarg3
 7591   // multiplier (int) = c_rarg4
 7592   address generate_dilithiumDecomposePoly() {
 7593     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 7594     int entry_count = StubInfo::entry_count(stub_id);
 7595     assert(entry_count == 1, "sanity check");
 7596     address start = load_archive_data(stub_id);
 7597     if (start != nullptr) {
 7598       return start;
 7599     }
 7600     __ align(CodeEntryAlignment);
 7601     StubCodeMark mark(this, stub_id);
 7602     start = __ pc();
 7603     Label L_loop;
 7604 
 7605     const Register input = c_rarg0;
 7606     const Register lowPart = c_rarg1;
 7607     const Register highPart = c_rarg2;
 7608     const Register twoGamma2 = c_rarg3;
 7609     const Register multiplier = c_rarg4;
 7610 
 7611     const Register len = r9;
 7612     const Register dilithiumConsts = r10;
 7613     const Register tmp = r11;
 7614 
 7615     // 6 independent sets of 4x4s values
 7616     VSeq<4> vs1(0), vs2(4), vs3(8);
 7617     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7618 
 7619     // 7 constants for cross-multiplying
 7620     VSeq<4> one(25, 0);
 7621     VSeq<4> qminus1(26, 0);
 7622     VSeq<4> g2(27, 0);
 7623     VSeq<4> twog2(28, 0);
 7624     VSeq<4> mult(29, 0);
 7625     VSeq<4> q(30, 0);
 7626     VSeq<4> qadd(31, 0);
 7627 
 7628     __ enter();
 7629 
 7630     __ lea(dilithiumConsts,
 7631              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7632 
 7633     // save callee-saved registers
 7634     __ stpd(v8, v9, __ pre(sp, -64));
 7635     __ stpd(v10, v11, Address(sp, 16));
 7636     __ stpd(v12, v13, Address(sp, 32));
 7637     __ stpd(v14, v15, Address(sp, 48));
 7638 
 7639     // populate constant registers
 7640     __ mov(tmp, zr);
 7641     __ add(tmp, tmp, 1);
 7642     __ dup(one[0], __ T4S, tmp); // 1
 7643     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7644     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7645     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7646     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7647     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7648     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7649 
 7650     __ mov(len, zr);
 7651     __ add(len, len, 1024);
 7652 
 7653     __ BIND(L_loop);
 7654 
 7655     // load next 4x4S inputs interleaved: rplus --> vs1
 7656     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7657 
 7658     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7659     vs_addv(vtmp, __ T4S, vs1, qadd);
 7660     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7661     vs_mulv(vtmp, __ T4S, vtmp, q);
 7662     vs_subv(vs1, __ T4S, vs1, vtmp);
 7663 
 7664     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7665     vs_sshr(vtmp, __ T4S, vs1, 31);
 7666     vs_andr(vtmp, vtmp, q);
 7667     vs_addv(vs1, __ T4S, vs1, vtmp);
 7668 
 7669     // quotient --> vs2
 7670     // int quotient = (rplus * multiplier) >> 22;
 7671     vs_mulv(vtmp, __ T4S, vs1, mult);
 7672     vs_sshr(vs2, __ T4S, vtmp, 22);
 7673 
 7674     // r0 --> vs3
 7675     // int r0 = rplus - quotient * twoGamma2;
 7676     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7677     vs_subv(vs3, __ T4S, vs1, vtmp);
 7678 
 7679     // mask --> vs4
 7680     // int mask = (twoGamma2 - r0) >> 22;
 7681     vs_subv(vtmp, __ T4S, twog2, vs3);
 7682     vs_sshr(vs4, __ T4S, vtmp, 22);
 7683 
 7684     // r0 -= (mask & twoGamma2);
 7685     vs_andr(vtmp, vs4, twog2);
 7686     vs_subv(vs3, __ T4S, vs3, vtmp);
 7687 
 7688     // quotient += (mask & 1);
 7689     vs_andr(vtmp, vs4, one);
 7690     vs_addv(vs2, __ T4S, vs2, vtmp);
 7691 
 7692     // mask = (twoGamma2 / 2 - r0) >> 31;
 7693     vs_subv(vtmp, __ T4S, g2, vs3);
 7694     vs_sshr(vs4, __ T4S, vtmp, 31);
 7695 
 7696     // r0 -= (mask & twoGamma2);
 7697     vs_andr(vtmp, vs4, twog2);
 7698     vs_subv(vs3, __ T4S, vs3, vtmp);
 7699 
 7700     // quotient += (mask & 1);
 7701     vs_andr(vtmp, vs4, one);
 7702     vs_addv(vs2, __ T4S, vs2, vtmp);
 7703 
 7704     // r1 --> vs5
 7705     // int r1 = rplus - r0 - (dilithium_q - 1);
 7706     vs_subv(vtmp, __ T4S, vs1, vs3);
 7707     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7708 
 7709     // r1 --> vs1 (overwriting rplus)
 7710     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7711     vs_negr(vtmp, __ T4S, vs5);
 7712     vs_orr(vtmp, vs5, vtmp);
 7713     vs_sshr(vs1, __ T4S, vtmp, 31);
 7714 
 7715     // r0 += ~r1;
 7716     vs_notr(vtmp, vs1);
 7717     vs_addv(vs3, __ T4S, vs3, vtmp);
 7718 
 7719     // r1 = r1 & quotient;
 7720     vs_andr(vs1, vs2, vs1);
 7721 
 7722     // store results interleaved
 7723     // lowPart[m] = r0;
 7724     // highPart[m] = r1;
 7725     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7726     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7727 
 7728     __ sub(len, len, 64);
 7729     __ cmp(len, (u1)64);
 7730     __ br(Assembler::GE, L_loop);
 7731 
 7732     // restore callee-saved vector registers
 7733     __ ldpd(v14, v15, Address(sp, 48));
 7734     __ ldpd(v12, v13, Address(sp, 32));
 7735     __ ldpd(v10, v11, Address(sp, 16));
 7736     __ ldpd(v8, v9, __ post(sp, 64));
 7737 
 7738     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7739     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 7740     __ ret(lr);
 7741 
 7742     // record the stub entry and end
 7743     store_archive_data(stub_id, start, __ pc());
 7744 
 7745     return start;
 7746   }
 7747 
  // Constants shared by generate_intpoly_montgomeryMult_P256() and its
  // helper gpr_partial_mult_52(), which operate on 52-bit limbs held in
  // 64-bit registers.
  static constexpr int montMulP256Shift1 = 12; // 64 - bits per limb
  static constexpr int montMulP256Shift2 = 52; // bits per limb
  // stack space needed for carry computation
  // (reserved below sp by the P256 stub and addressed through c_ptr)
  static constexpr int cDataSize = 6 * BytesPerLong;
  // stack space needed for data computed by the neon side
  // (the sixteen 64-bit low/high partial products of a_3, a_4 with
  // b_0 ... b_3 that the P256 stub writes through mul_ptr)
  static constexpr int mulDataSize = 16 * BytesPerLong;
 7754 
 7755 
 7756   // Subroutine used by the 52 x 52 bit multiplication algorithm in
 7757   // generate_intpoly_montgomeryMult_P256().
 7758   // This function computes partial results of eight 52 x 52 bit multiplications,
 7759   // where the multiplicands are stored as 64-bit values, specifically
 7760   // (b_0, b_1, b_2, b_3) * (a_3, a_4). (The 4 calls to this function
 7761   // together provide the results of these limb-multiplications.)
 7762   // Calls to this function accept either the low 32 bits or high 20 bits
 7763   // of each b_i packed into bs in ascending order. a_3 and a_4 are packed
 7764   // into successive 64 bit elements of as. lane selects the low 32 or high
 7765   // 20 bits of each a_j value. So four calls with the appropriate parameters
 7766   // will produce the 64-bit low32 * low32, low32 * high20, high20 * low32,
 7767   // high20 * high20 values in the output register sequences vs. The
 7768   // 64-bit partial products are returned in vs in ascending order:
 7769   // vs[0] = (b_0*a_3, b_1*a_3) . . .  vs[3] = (b_2*a_4, b_3*a_4)
 7770 
 7771   void neon_partial_mult_64(const VSeq<4>& vs, FloatRegister bs, FloatRegister as, int lane_lo) {
 7772     __ umullv(vs[0], __ T2D, bs, __ T2S, as, __ S, lane_lo);
 7773     __ umull2v(vs[1], __ T2D, bs, __ T4S, as, __ S, lane_lo);
 7774     __ umullv(vs[2], __ T2D, bs, __ T2S, as, __ S, lane_lo + 2);
 7775     __ umull2v(vs[3], __ T2D, bs, __ T4S, as, __ S, lane_lo + 2);
 7776   }
 7777 
    // Subroutine used by the generate_intpoly_montgomeryMult_P256() function
    // to compute the result of a 52 x 52 bit multiplication where the
    // multiplicands, a and b, are available as 64-bit values.
    // The result goes to two 64-bit registers: lo (least significant 52 bits)
    // and hi (most significant 52 bits).
    //
    //   a, b - the two multiplicands
    //   hi   - receives product bits 52 and up
    //   lo   - receives the product ANDed with mask
    //   mask - limb mask applied to lo (callers pass the 52-bit limb mask)
    //
    // n.b. hi is written by the first multiply before the second one reads
    // a and b, so hi must not be the same register as a or b.
    void gpr_partial_mult_52(Register a, Register b, Register hi, Register lo,
     Register mask) {
      // compute 104-bit (40 + 64) full product
      __ umulh(hi, a, b);
      __ mul(lo, a, b);
      // combine 40 + 12 bits into hi result
      // on certain implementations of aarch64 (e.g. apple M1) replacing extr()
      // with the following equivalent instruction sequence improves
      // performance slightly (even though it is two instructions longer and
      // needs an additional register)
      //      __ lsl(hi, hi, montMulP256Shift1);
      //      __ lsr(tmp, lo, montMulP256Shift2);
      //      __ orr(hi, hi, tmp);
      __ extr(hi, hi, lo, montMulP256Shift2);
      // mask off 52 bits of lo result
      __ andr(lo, lo, mask);
    }
 7800 
 7801   // This assembly follows the Java code in MontgomeryIntegerPolynomial256.mult()
 7802   // quite closely. The main difference is that the computations done with the
 7803   // last two limbs of `a` are done using Neon registers. This allows us to take
 7804   // advantage of both the Neon registers and GPRs simultaneously.
 7805   // It is also worth noting that since Neon does not support 64 bit
 7806   // multiplication, we split each 64 bit value into lower and upper halves
 7807   // and use the "schoolbook" multiplication algorithm.
 7808   address generate_intpoly_montgomeryMult_P256() {
 7809     assert(UseIntPolyIntrinsics, "what are we doing here?");
 7810     StubId stub_id = StubId::stubgen_intpoly_montgomeryMult_P256_id;
 7811     int entry_count = StubInfo::entry_count(stub_id);
 7812     assert(entry_count == 1, "sanity check");
 7813     address start = load_archive_data(stub_id);
 7814     if (start != nullptr) {
 7815       return start;
 7816     }
 7817     __ align(CodeEntryAlignment);
 7818     StubCodeMark mark(this, stub_id);
 7819     start = __ pc();
 7820     __ enter();
 7821 
 7822     // Registers that are used throughout entire routine
 7823     const Register a = c_rarg0;
 7824     const Register b = c_rarg1;
 7825     const Register result = c_rarg2;
 7826 
 7827     RegSet regs = RegSet::range(r0, r28) - rscratch1 - rscratch2
 7828       - r16 - r17 - r18_tls - a - b - result;
 7829 
 7830     auto common_regs = regs.begin();
 7831     Register limb_mask = *common_regs++,
 7832       c_ptr = *common_regs++,
 7833       mod_0 = *common_regs++,
 7834       mod_1 = *common_regs++,
 7835       mod_3 = *common_regs++,
 7836       mod_4 = *common_regs++,
 7837       b_0 = *common_regs++,
 7838       b_1 = *common_regs++,
 7839       b_2 = *common_regs++,
 7840       b_3 = *common_regs++,
 7841       b_4 = *common_regs++;
 7842 
 7843     FloatRegSet floatRegs = FloatRegSet::range(v0, v31)
 7844       - FloatRegSet::range(v8, v15)   // Caller saved vectors
 7845       - FloatRegSet::range(v16, v31); // Manually-allocated vectors
 7846 
 7847     auto common_vectors = floatRegs.begin();
 7848     FloatRegister limb_mask_vec = *common_vectors++,
 7849       b_lows = *common_vectors++,
 7850       b_highs = *common_vectors++,
 7851       a_vals = *common_vectors++;
 7852 
 7853     // Push callee saved registers on to the stack
 7854     RegSet callee_saved = RegSet::range(r19, r28);
 7855     __ push(callee_saved, sp);
 7856 
 7857     // Allocate space on the stack for carry values
 7858     __ sub(sp, sp, cDataSize);
 7859     __ mov(c_ptr, sp);
 7860 
 7861     // Calculate (52-bit) limb masks for both gpr and vector registers
 7862     __ mov(limb_mask, -UCONST64(1) >> montMulP256Shift1);
 7863     __ dup(limb_mask_vec, __ T2D, limb_mask);
 7864 
 7865     //Load input arrays and modulus
 7866     Register a_ptr = *common_regs++, mod_ptr = *common_regs++;
 7867      // skip 3 limbs so a_ptr addresses trailing pair {a3, a4}
 7868     __ add(a_ptr, a, 3 * BytesPerLong);
 7869     __ lea(mod_ptr, ExternalAddress((address)_modulus_P256));
 7870     __ ldr(b_0, Address(b));
 7871     __ ldr(b_1, Address(b, BytesPerLong));
 7872     __ ldr(b_2, Address(b, 2 * BytesPerLong));
 7873     __ ldr(b_3, Address(b, 3 * BytesPerLong));
 7874     __ ldr(b_4, Address(b, 4 * BytesPerLong));
 7875     __ ldr(mod_0, __ post(mod_ptr, BytesPerLong));
 7876     __ ldr(mod_1, __ post(mod_ptr, BytesPerLong));
 7877     __ ldr(mod_3, __ post(mod_ptr, BytesPerLong));
 7878     __ ldr(mod_4, mod_ptr);
 7879     __ ld1(a_vals, __ T2D, a_ptr);
 7880     // use an interleaved load to group low 32 bits and high 20 bits
 7881     // of 4 successive b values into two vector registers
 7882     // n.b. these are the same inputs as the ones in b_0 ... b4
 7883     __ ld2(b_lows, b_highs, __ T4S, b);
 7884     common_regs = common_regs.remaining()
 7885       + a_ptr + mod_ptr;
 7886         a_ptr = mod_ptr = noreg;
 7887 
 7888     //Regs used throughout the main "loop", which is partially unrolled here
 7889     Register high = *common_regs++,
 7890       low = *common_regs++,
 7891       mul_ptr = *common_regs++,
 7892       mod_high = *common_regs++,
 7893       mod_low = *common_regs++,
 7894       a_i = *common_regs++,
 7895       c_i = *common_regs++,
 7896       tmp = *common_regs++,
 7897       n = *common_regs++;
 7898 
 7899     // vector sequences used to compute and combine partial products of
 7900     // b_i * a_j for i = {0,1,2,3} j = {3,4}
 7901     VSeq<4> A(16);
 7902     VSeq<4> B(20);
 7903     VSeq<4> C(24);
 7904     VSeq<4> D(28);
 7905 
 7906 
 7907     // neon and gpr computations are interleaved to maximize parallelism
 7908 
 7909     // allocate stack space for the neon results
 7910     __ sub(sp, sp, mulDataSize);
 7911     __ mov(mul_ptr, sp);
 7912 
 7913     // cross-multiply low * low for limbs b0-b3 and a3-a4 in parallel
 7914     neon_partial_mult_64(A, b_lows, a_vals, 0);
 7915 
 7916     // Limb 0
 7917     __ ldr(a_i, __ post(a, BytesPerLong));
 7918     gpr_partial_mult_52(a_i, b_0, high, low, limb_mask);
 7919     __ mov(n, low);
 7920    // __ andr(n, low, limb_mask);
 7921 
 7922     // cross-multiply high * low for limbs b0-b3 and a3-a4 in parallel
 7923     neon_partial_mult_64(B, b_highs, a_vals, 0);
 7924 
 7925     // Limb 0 modulus computation
 7926     // n.b. modulus computation requires multiplying successive
 7927     // limbs of the product by corresponding limbs of the p256
 7928     // prime adding the result to the limb and folding this
 7929     // partial result into a running 256-bit sum in c_i. Limbs
 7930     // of c_i are stored via c_ptr once carries are included.
 7931     // n.b. the mul + add is omitted for limb 2 since the
 7932     // corresponding prime bits are zero.
 7933     gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
 7934     __ add(low, low, mod_low);
 7935     __ add(high, high, mod_high);
 7936     __ lsr(c_i, low, montMulP256Shift2);
 7937     __ add(c_i, c_i, high);
 7938 
 7939     // cross-multiply low * high for limbs b0-b3 and a3-a4 in parallel
 7940     neon_partial_mult_64(C, b_lows, a_vals, 1);
 7941 
 7942     // Limb 1
 7943     gpr_partial_mult_52(a_i, b_1, high, low, limb_mask);
 7944 
 7945     // cross-multiply high * high for limbs b0-b3 and a3-a4 in parallel
 7946     neon_partial_mult_64(D, b_highs, a_vals, 1);
 7947 
 7948     gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
 7949     __ add(low, low, mod_low);
 7950     __ add(high, high, mod_high);
 7951     __ add(c_i, c_i, low);
 7952     __ str(c_i, c_ptr);
 7953     __ mov(c_i, high);
 7954 
 7955     // combine neon 32-bit partial products, regrouping to produce
 7956     // 8*52-bit low products in A and 8*52-bit high products in D
 7957 
 7958     // add low*high/high*low intermediate products before regrouping
 7959     vs_addv(B, __ T2D, B, C); // Store (B+C) in B
 7960 
 7961     // Limb 2
 7962     gpr_partial_mult_52(a_i, b_2, high, low, limb_mask);
 7963     __ add(c_i, c_i, low);
 7964     __ str(c_i, Address(c_ptr, 8));
 7965     __ mov(c_i, high);
 7966 
 7967     // shift high*high (40-bit) product up into 52-bits of output
 7968     vs_shl(D, __ T2D, D, montMulP256Shift1);
 7969 
 7970     // Limb 3
 7971     gpr_partial_mult_52(a_i, b_3, high, low, limb_mask);
 7972 
 7973     // shift high 32 (or 33) bits of intermediate products for addition to D
 7974     vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) >>> 20)
 7975 
 7976     gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
 7977     __ add(low, low, mod_low);
 7978     __ add(high, high, mod_high);
 7979     __ add(c_i, c_i, low);
 7980     __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
 7981     __ mov(c_i, high);
 7982 
 7983     // shift low 32 bits of intermediate product up for masking and addition to A
 7984     vs_shl(B, __ T2D, B, 32);
 7985 
 7986     // Limb 4
 7987     gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
 7988 
 7989     // add high bits of intermediate product into D
 7990     vs_addv(D, __ T2D, D, C);
 7991 
 7992     gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
 7993     __ add(low, low, mod_low);
 7994     __ add(high, high, mod_high);
 7995     __ add(c_i, c_i, low);
 7996     __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
 7997     __ str(high, Address(c_ptr, 4 * BytesPerLong));
 7998 
 7999     // top 12 bits of 32*32 bit product in A need adding into high 52-bit output
 8000     vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)
 8001     // Only 20 of the 32 bits now in the top of B should be added into A
 8002     vs_andr(B, B, limb_mask_vec);
 8003     // reduce original 64-bit product to 52-bits
 8004     vs_andr(A, A, limb_mask_vec);
 8005     // add intermediate products to high 52-bit result in D
 8006     vs_addv(D, __ T2D, D, C);
 8007     // add 20/21 bits of intermediate product in top of B into low 52-bit result
 8008     vs_addv(A, __ T2D, A, B);
 8009     // save and then mask off any overflow bit from computing low 52-bit result
 8010     vs_ushr(B, __ T2D, A, montMulP256Shift2);
 8011     vs_andr(A, A, limb_mask_vec);
 8012     // add any remaining carry into the high 52-bit result
 8013     vs_addv(D, __ T2D, D, B);
 8014 
 8015     // the write interleaves the 4 successive pairs of low and
 8016     // high results: (l0, l1), (h0, h1), ... (l6, l7), (h6, h7)
 8017     vs_st1_interleaved(A, D, mul_ptr);
 8018 
 8019     // Free mul_ptr
 8020     common_regs = common_regs.remaining() + mul_ptr;
 8021     mul_ptr = noreg;
 8022 
 8023     /////////////////////////
 8024     // Loop 2 & 3
 8025     /////////////////////////
 8026 
 8027     for (int i = 0; i < 2; i++) {
 8028       // Load a_i and increment by 8 bytes
 8029       __ ldr(a_i, __ post(a, BytesPerLong));
 8030       __ ldr(c_i, c_ptr); //Load prior c_i
 8031 
 8032       // Limb 0
 8033       gpr_partial_mult_52(a_i, b_0, high, low, limb_mask);
 8034       __ add(low, low, c_i);
 8035       __ ldr(c_i, Address(c_ptr, BytesPerLong));
 8036       __ andr(n, low, limb_mask);
 8037       gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
 8038       __ add(low, low, mod_low);
 8039       __ add(high, high, mod_high);
 8040       __ lsr(tmp, low, montMulP256Shift2);
 8041       __ add(c_i, c_i, tmp);
 8042       __ add(c_i, c_i, high);
 8043 
 8044       // Limb 1
 8045       gpr_partial_mult_52(a_i, b_1, high, low, limb_mask);
 8046       gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
 8047       __ ldr(tmp, Address(c_ptr, 2 * BytesPerLong));
 8048       __ add(low, low, mod_low);
 8049       __ add(high, high, mod_high);
 8050       __ add(c_i, c_i, low);
 8051       __ str(c_i, c_ptr);
 8052       __ add(c_i, tmp, high);
 8053 
 8054       // Limb 2
 8055       gpr_partial_mult_52(a_i, b_2, high, low, limb_mask);
 8056       __ ldr(tmp, Address(c_ptr, 3 * BytesPerLong));
 8057       __ add(c_i, c_i, low);
 8058       __ str(c_i, Address(c_ptr, BytesPerLong));
 8059       __ add(c_i, tmp, high);
 8060 
 8061       // Limb 3
 8062       gpr_partial_mult_52(a_i, b_3, high, low, limb_mask);
 8063       gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
 8064       __ ldr(tmp, Address(c_ptr, 4 * BytesPerLong));
 8065       __ add(low, low, mod_low);
 8066       __ add(high, high, mod_high);
 8067       __ add(c_i, c_i, low);
 8068       __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
 8069       __ add(c_i, tmp, high);
 8070 
 8071       // Limb 4
 8072       gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
 8073       gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
 8074       __ add(low, low, mod_low);
 8075       __ add(high, high, mod_high);
 8076       __ add(c_i, c_i, low);
 8077       __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
 8078       __ str(high, Address(c_ptr, 4 * BytesPerLong));
 8079     }
 8080     // Reallocate regs b_0, b_1, b_2 and b_3
 8081         common_regs = common_regs.remaining()
 8082           + b_0 + b_1 + b_2 + b_3;
 8083             b_0 = b_1 = b_2 = b_3 = noreg;
 8084 
 8085     Register low_1 = *common_regs++;
 8086     Register high_1 = *common_regs++;
 8087 
 8088     //////////////////////////////
 8089     // a[3]
 8090     //////////////////////////////
 8091 
 8092     // For a_3 and a_4 we have already computed the cross-products
 8093     // with b_0 ... b_3 and stored them on the stack relative to
 8094     // `mul_ptr` i.e. the current `sp`in the order
 8095     // l(a_3 * b_0), l(a_3 * b_1), h(a_3 * b_0), h(a_3 * b_1),
 8096     // l(a_3 * b_2), l(a_3 * b_3), h(a_3 * b_2), h(a_3 * b_3),
 8097     // l(a_4 * b_0), l(a_4 * b_1), h(a_4 * b_0), h(a_4 * b_1),
 8098     // l(a_4 * b_2), l(a_4 * b_3), h(a_4 * b_2), h(a_4 * b_3),
 8099     // where l(x) is the low 52 bits of x and h(x) is the high 52 bits
 8100 
 8101     __ ldr(low_1, Address(sp));
 8102     __ ldr(high_1, Address(sp, 2 * BytesPerLong));
 8103 
 8104     __ ldr(low, Address(sp, BytesPerLong));
 8105     __ ldr(high, Address(sp, 3 * BytesPerLong));
 8106     __ ldr(a_i, __ post(a, BytesPerLong));
 8107     __ ldr(c_i, c_ptr);
 8108 
 8109     // Limb 0
 8110     __ add(low_1, low_1, c_i);
 8111     __ ldr(c_i, Address(c_ptr, BytesPerLong));
 8112     __ andr(n, low_1, limb_mask);
 8113     gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
 8114     __ add(low_1, low_1, mod_low);
 8115     __ add(high_1, high_1, mod_high);
 8116     __ lsr(tmp, low_1, montMulP256Shift2);
 8117     __ add(c_i, c_i, tmp);
 8118     __ add(c_i, c_i, high_1);
 8119 
 8120     // Limb 1
 8121     __ ldr(low_1, Address(sp, 4 * BytesPerLong));
 8122     __ ldr(high_1, Address(sp, 6 * BytesPerLong));
 8123     gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
 8124     __ ldr(tmp, Address(c_ptr, 2 * BytesPerLong));
 8125     __ andr(mod_low, mod_low, limb_mask);
 8126     __ add(low, low, mod_low);
 8127     __ add(high, high, mod_high);
 8128     __ add(c_i, c_i, low);
 8129     __ str(c_i, c_ptr);
 8130     __ add(c_i, tmp, high);
 8131 
 8132     // Limb 2
 8133     __ ldr(low, Address(sp, 5 * BytesPerLong));
 8134     __ ldr(high, Address(sp, 7 * BytesPerLong));
 8135     __ ldr(tmp, Address(c_ptr, 3 * BytesPerLong));
 8136     __ add(c_i, c_i, low_1);
 8137     __ str(c_i, Address(c_ptr, BytesPerLong));
 8138     __ add(c_i, tmp, high_1);
 8139 
 8140     // Limb 3
 8141     gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
 8142     __ ldr(tmp, Address(c_ptr, 4 * BytesPerLong));
 8143     __ add(low, low, mod_low);
 8144     __ add(high, high, mod_high);
 8145     __ add(c_i, c_i, low);
 8146     __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
 8147     __ add(c_i, tmp, high);
 8148 
 8149     // Limb 4
 8150     __ ldr(low, Address(sp, 8 * BytesPerLong));
 8151     __ ldr(high, Address(sp, 10 * BytesPerLong));
 8152     gpr_partial_mult_52(a_i, b_4, high_1, low_1, limb_mask);
 8153     gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
 8154     __ add(low_1, low_1, mod_low);
 8155     __ add(high_1, high_1, mod_high);
 8156     __ add(c_i, c_i, low_1);
 8157     __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
 8158     __ str(high_1, Address(c_ptr, 4 * BytesPerLong));
 8159 
 8160     //////////////////////////////
 8161     // a[4]
 8162     //////////////////////////////
 8163 
 8164     Register c5 = *common_regs++,
 8165       c6 = *common_regs++,
 8166       c7 = *common_regs++;
 8167 
 8168     __ ldr(a_i, a);
 8169     __ ldr(c_i, c_ptr);
 8170 
 8171     // Limb 0
 8172     __ ldr(low_1, Address(sp, 9 * BytesPerLong));
 8173     __ ldr(high_1, Address(sp, 11 * BytesPerLong));
 8174 
 8175     __ add(low, low, c_i);
 8176     __ ldr(c_i, Address(c_ptr, BytesPerLong));
 8177     __ andr(n, low, limb_mask);
 8178     gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
 8179     __ add(low, low, mod_low);
 8180     __ add(high, high, mod_high);
 8181     __ lsr(tmp, low, montMulP256Shift2);
 8182     __ add(c_i, c_i, tmp);
 8183     __ add(c_i, c_i, high);
 8184 
 8185     __ ldr(low, Address(sp, 12 * BytesPerLong));
 8186     __ ldr(high, Address(sp, 14 * BytesPerLong));
 8187     gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
 8188     __ add(low_1, low_1, mod_low);
 8189     __ add(high_1, high_1, mod_high);
 8190     __ add(c5, c_i, low_1);
 8191     __ ldr(c_i, Address(c_ptr, 2 * BytesPerLong));
 8192     __ lsr(tmp, c5, montMulP256Shift2);
 8193     __ add(c_i, c_i, tmp);
 8194     __ add(c_i, c_i, high_1);
 8195 
 8196     // Limb 2
 8197     __ ldr(low_1, Address(sp, 13 * BytesPerLong));
 8198     __ ldr(high_1, Address(sp, 15 * BytesPerLong));
 8199     __ add(c6, c_i, low);
 8200     __ ldr(c_i, Address(c_ptr, 3 * BytesPerLong));
 8201     __ lsr(tmp, c6, montMulP256Shift2);
 8202     __ add(c_i, c_i, tmp);
 8203     __ add(c_i, c_i, high);
 8204 
 8205     // Limb 3
 8206     gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
 8207     __ add(low_1, low_1, mod_low);
 8208     __ add(high_1, high_1, mod_high);
 8209     __ add(c7, c_i, low_1);
 8210     __ ldr(c_i, Address(c_ptr, 4 * BytesPerLong));
 8211     __ lsr(tmp, c7, montMulP256Shift2);
 8212     __ add(c_i, c_i, tmp);
 8213     __ add(c_i, c_i, high_1);
 8214 
 8215     // Limb 4
 8216     gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
 8217     gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
 8218     __ add(low, low, mod_low);
 8219     __ add(high, high, mod_high);
 8220 
 8221     // Reallocate b_4
 8222     common_regs = common_regs.remaining() + b_4;
 8223     b_4 = noreg;
 8224 
 8225     Register c8 = *common_regs++,
 8226       c9 = *common_regs++;
 8227 
 8228     __ add(c8, c_i, low);
 8229     __ lsr(c9, c8, montMulP256Shift2);
 8230     __ add(c9, c9, high);
 8231 
 8232     __ andr(c5, c5, limb_mask);
 8233     __ andr(c6, c6, limb_mask);
 8234     __ andr(c7, c7, limb_mask);
 8235     __ andr(c8, c8, limb_mask);
 8236 
 8237     /////////////////////////////
 8238     // Final carry propagate
 8239     /////////////////////////////
 8240 
 8241     // c0 = c5 - modulus[0];
 8242     // c1 = c6 - modulus[1] + (c0 >> BITS_PER_LIMB);
 8243     // c0 &= LIMB_MASK;
 8244     // c2 = c7 + (c1 >> BITS_PER_LIMB);
 8245     // c1 &= LIMB_MASK;
 8246     // c3 = c8 - modulus[3] + (c2 >> BITS_PER_LIMB);
 8247     // c2 &= LIMB_MASK;
    // c4 = c9 - modulus[4] + (c3 >> BITS_PER_LIMB);
 8249     // c3 &= LIMB_MASK;
 8250 
 8251     // Free up all unused regs
 8252     common_regs = common_regs.remaining()
 8253       + c_ptr + low + high + mod_high
 8254       + mod_low + a_i + c_i + n + low_1 + high_1;
 8255         c_ptr = low = high = mod_high
 8256       = mod_low = a_i = c_i = n = low_1 = high_1 = noreg;
 8257 
 8258     Register c0 = *common_regs++,
 8259       c1 = *common_regs++,
 8260       c2 = *common_regs++,
 8261       c3 = *common_regs++,
 8262       c4 = *common_regs++;
 8263 
 8264     __ sub(c0, c5, mod_0);
 8265     __ sub(c1, c6, mod_1);
 8266     __ sub(c3, c8, mod_3);
 8267     __ sub(c4, c9, mod_4);
 8268     __ add(c1, c1, c0, Assembler::ASR, montMulP256Shift2);
 8269     __ andr(c0, c0, limb_mask);
 8270     __ add(c2, c7, c1, Assembler::ASR, montMulP256Shift2);
 8271     __ andr(c1, c1, limb_mask);
 8272     __ add(c3, c3, c2, Assembler::ASR, montMulP256Shift2);
 8273     __ andr(c2, c2, limb_mask);
 8274     __ add(c4, c4, c3, Assembler::ASR, montMulP256Shift2);
 8275     __ andr(c3, c3, limb_mask);
 8276 
 8277     // Final write back
 8278     // mask = c4 >> 63
 8279     // r[0] = ((c5 & mask) | (c0 & ~mask));
 8280     // r[1] = ((c6 & mask) | (c1 & ~mask));
 8281     // r[2] = ((c7 & mask) | (c2 & ~mask));
 8282     // r[3] = ((c8 & mask) | (c3 & ~mask));
 8283     // r[4] = ((c9 & mask) | (c4 & ~mask));
 8284 
 8285     common_regs = common_regs.remaining()
 8286       + mod_0 + mod_1 + mod_3 + mod_4;
 8287         mod_0 = mod_1 = mod_3 = mod_4 = noreg;
 8288 
 8289     Register mask = *common_regs++;
 8290     Register nmask = *common_regs++;
 8291 
 8292     __ asr(mask, c4, 63);
 8293     __ mvn(nmask, mask);
 8294     __ andr(c5, c5, mask);
 8295     __ andr(tmp, c0, nmask);
 8296     __ orr(c5, c5, tmp);
 8297     __ andr(c6, c6, mask);
 8298     __ andr(tmp, c1, nmask);
 8299     __ orr(c6, c6, tmp);
 8300     __ andr(c7, c7, mask);
 8301     __ andr(tmp, c2, nmask);
 8302     __ orr(c7, c7, tmp);
 8303     __ andr(c8, c8, mask);
 8304     __ andr(tmp, c3, nmask);
 8305     __ orr(c8, c8, tmp);
 8306     __ andr(c9, c9, mask);
 8307     __ andr(tmp, c4, nmask);
 8308     __ orr(c9, c9, tmp);
 8309 
 8310     __ str(c5, result);
 8311     __ str(c6, Address(result, BytesPerLong));
 8312     __ str(c7, Address(result, 2 * BytesPerLong));
 8313     __ str(c8, Address(result, 3 * BytesPerLong));
 8314     __ str(c9, Address(result, 4 * BytesPerLong));
 8315 
 8316     // End intrinsic call
 8317     __ add(sp, sp, cDataSize + mulDataSize);
 8318     __ pop(callee_saved, sp);
 8319     __ leave();
 8320     __ mov(r0, zr); // return 0
 8321     __ ret(lr);
 8322 
 8323     // record the stub entry and end
 8324     store_archive_data(stub_id, start, __ pc());
 8325 
 8326     return start;
 8327   }
 8328 
 8329   address generate_intpoly_assign() {
 8330     // KNOWN Lengths:
 8331     //   MontgomeryIntPolynP256:  5 = 4 + 1
 8332     //   IntegerPolynomial1305:   5 = 4 + 1
 8333     //   IntegerPolynomial25519: 10 = 8 + 2
 8334     //   IntegerPolynomialP256:  10 = 8 + 2
 8335     //   Curve25519OrderField:   10 = 8 + 2
 8336     //   Curve25519OrderField:   10 = 8 + 2
 8337     //   P256OrderField:         10 = 8 + 2
 8338     //   IntegerPolynomialP384:  14 = 8 + 4 + 2
 8339     //   P384OrderField:         14 = 8 + 4 + 2
 8340     //   IntegerPolynomial448:   16 = 8 + 8
 8341     //   Curve448OrderField:     16 = 8 + 8
 8342     //   Curve448OrderField:     16 = 8 + 8
 8343     //   IntegerPolynomialP521:  19 = 8 + 8 + 2 + 1
 8344     //   P521OrderField:         19 = 8 + 8 + 2 + 1
 8345     // Special Cases 5, 10, 14, 16, 19
 8346     assert(UseIntPolyIntrinsics, "what are we doing here?");
 8347     StubId stub_id = StubId::stubgen_intpoly_assign_id;
 8348     int entry_count = StubInfo::entry_count(stub_id);
 8349     assert(entry_count == 1, "sanity check");
 8350     address start = load_archive_data(stub_id);
 8351     if (start != nullptr) {
 8352       return start;
 8353     }
 8354 
 8355     __ align(CodeEntryAlignment);
 8356     StubCodeMark mark(this, stub_id);
 8357     start = __ pc();
 8358     __ enter();
 8359 
 8360     // Inputs
 8361     const Register set = c_rarg0;
 8362     const Register aLimbs = c_rarg1;
 8363     const Register bLimbs = c_rarg2;
 8364     const Register length = c_rarg3;
 8365 
 8366     Label L_Length5, L_Length10, L_Length14, L_Length16, L_Length19, L_Default, L_Done;
 8367 
 8368     /*
 8369     int maskValue = -set;
 8370     for (int i = 0; i < a.length; i++) {
 8371         long dummyLimbs = maskValue & (a[i] ^ b[i]);
 8372         a[i] = dummyLimbs ^ a[i];
 8373     }
 8374     */
 8375     Register mask_scalar = r4;
 8376     FloatRegister mask_vec = v0;
 8377 
 8378     __ neg(mask_scalar, set);
 8379     __ dup(mask_vec, __ T2D, mask_scalar);
 8380 
 8381     __ cmp(length, (u1)5);
 8382     __ br(Assembler::EQ, L_Length5);
 8383     __ cmp(length, (u1)10);
 8384     __ br(Assembler::EQ, L_Length10);
 8385     __ cmp(length, (u1)14);
 8386     __ br(Assembler::EQ, L_Length14);
 8387     __ cmp(length, (u1)16);
 8388     __ br(Assembler::EQ, L_Length16);
 8389     __ cmp(length, (u1)19);
 8390     __ br(Assembler::EQ, L_Length19);
 8391     __ b(L_Default);
 8392 
 8393 
 8394     // Length = 5
 8395     // Use 5 GPRs (neon not faster with this few limbs)
 8396     __ BIND(L_Length5);
 8397     {
 8398       Register a0 = r5;
 8399       Register a1 = r6;
 8400       Register a2 = r7;
 8401       Register a3 = r10;
 8402       Register a4 = r11;
 8403       Register b0 = r12;
 8404       Register b1 = r13;
 8405       Register b2 = r14;
 8406       Register b3 = r15;
 8407       Register b4 = r19;
 8408 
 8409       __ push(r19, sp);
 8410 
 8411       __ ldr(a0, aLimbs);
 8412       __ ldr(a1, Address(aLimbs, 1 * BytesPerLong));
 8413       __ ldr(a2, Address(aLimbs, 2 * BytesPerLong));
 8414       __ ldr(a3, Address(aLimbs, 3 * BytesPerLong));
 8415       __ ldr(a4, Address(aLimbs, 4 * BytesPerLong));
 8416 
 8417       __ ldr(b0, bLimbs);
 8418       __ ldr(b1, Address(bLimbs, 1 * BytesPerLong));
 8419       __ ldr(b2, Address(bLimbs, 2 * BytesPerLong));
 8420       __ ldr(b3, Address(bLimbs, 3 * BytesPerLong));
 8421       __ ldr(b4, Address(bLimbs, 4 * BytesPerLong));
 8422 
 8423       __ eor(b0, b0, a0);
 8424       __ eor(b1, b1, a1);
 8425       __ eor(b2, b2, a2);
 8426       __ eor(b3, b3, a3);
 8427       __ eor(b4, b4, a4);
 8428 
 8429       __ andr(b0, b0, mask_scalar);
 8430       __ andr(b1, b1, mask_scalar);
 8431       __ andr(b2, b2, mask_scalar);
 8432       __ andr(b3, b3, mask_scalar);
 8433       __ andr(b4, b4, mask_scalar);
 8434 
 8435       __ eor(a0, a0, b0);
 8436       __ eor(a1, a1, b1);
 8437       __ eor(a2, a2, b2);
 8438       __ eor(a3, a3, b3);
 8439       __ eor(a4, a4, b4);
 8440 
 8441       __ str(a0, aLimbs);
 8442       __ str(a1, Address(aLimbs, 1 * BytesPerLong));
 8443       __ str(a2, Address(aLimbs, 2 * BytesPerLong));
 8444       __ str(a3, Address(aLimbs, 3 * BytesPerLong));
 8445       __ str(a4, Address(aLimbs, 4 * BytesPerLong));
 8446 
 8447       __ pop(r19, sp);
 8448       __ b(L_Done);
 8449     }
 8450 
 8451     // Length = 10
 8452     // Split into 4 neon regs and 2 GPRs
 8453     __ BIND(L_Length10);
 8454     {
 8455       Register a9 = r10;
 8456       Register a10 = r11;
 8457       Register b9 = r12;
 8458       Register b10 = r13;
 8459 
 8460       VSeq<4> a_vec(16);
 8461       VSeq<4> b_vec(20);
 8462 
 8463       __ ldr(a9, Address(aLimbs, 8 * BytesPerLong));
 8464       __ ldr(a10, Address(aLimbs, 9 * BytesPerLong));
 8465       __ ldr(b9, Address(bLimbs, 8 * BytesPerLong));
 8466       __ ldr(b10, Address(bLimbs, 9 * BytesPerLong));
 8467 
 8468       vs_ldpq(a_vec, aLimbs);
 8469 
 8470       __ eor(b9, b9, a9);
 8471       __ eor(b10, b10, a10);
 8472 
 8473       vs_ldpq(b_vec, bLimbs);
 8474 
 8475       __ andr(b9, b9, mask_scalar);
 8476       __ andr(b10, b10, mask_scalar);
 8477 
 8478       vs_eor(b_vec, b_vec, a_vec);
 8479 
 8480       __ eor(a9, a9, b9);
 8481       __ eor(a10, a10, b10);
 8482 
 8483       vs_andr(b_vec, b_vec, mask_vec);
 8484 
 8485       __ str(a9, Address(aLimbs, 8 * BytesPerLong));
 8486       __ str(a10, Address(aLimbs, 9 * BytesPerLong));
 8487 
 8488       vs_eor(a_vec, a_vec, b_vec);
 8489       vs_stpq_post(a_vec, aLimbs);
 8490 
 8491       __ b(L_Done);
 8492     }
 8493 
 8494     // Length = 14
 8495     // Split into 5 neon regs and 4 GPRs
 8496     __ BIND(L_Length14);
 8497     {
 8498       Register a10 = r5;
 8499       Register a11 = r6;
 8500       Register a12 = r7;
 8501       Register a13 = r8;
 8502       Register b10 = r9;
 8503       Register b11 = r10;
 8504       Register b12 = r11;
 8505       Register b13 = r12;
 8506 
 8507       VSeq<5> a_vec(16);
 8508       VSeq<5> b_vec(22);
 8509 
 8510       int offsets[2] = { 0, 32 };
 8511 
 8512       __ ldr(a10, Address(aLimbs, 10 * BytesPerLong));
 8513       __ ldr(a11, Address(aLimbs, 11 * BytesPerLong));
 8514       __ ldr(a12, Address(aLimbs, 12 * BytesPerLong));
 8515       __ ldr(a13, Address(aLimbs, 13 * BytesPerLong));
 8516 
 8517       __ ldr(b10, Address(bLimbs, 10 * BytesPerLong));
 8518       __ ldr(b11, Address(bLimbs, 11 * BytesPerLong));
 8519       __ ldr(b12, Address(bLimbs, 12 * BytesPerLong));
 8520       __ ldr(b13, Address(bLimbs, 13 * BytesPerLong));
 8521 
 8522       __ ld1(a_vec[0], __ T2D, aLimbs);
 8523       vs_ldpq_indexed(vs_tail(a_vec), aLimbs, 16, offsets);
 8524 
 8525       __ eor(b10, b10, a10);
 8526       __ eor(b11, b11, a11);
 8527       __ eor(b12, b12, a12);
 8528       __ eor(b13, b13, a13);
 8529 
 8530       __ ld1(b_vec[0], __ T2D, bLimbs);
 8531       vs_ldpq_indexed(vs_tail(b_vec), bLimbs, 16, offsets);
 8532 
 8533       __ andr(b10, b10, mask_scalar);
 8534       __ andr(b11, b11, mask_scalar);
 8535       __ andr(b12, b12, mask_scalar);
 8536       __ andr(b13, b13, mask_scalar);
 8537 
 8538       vs_eor(b_vec, b_vec, a_vec);
 8539 
 8540       __ eor(a10, a10, b10);
 8541       __ eor(a11, a11, b11);
 8542       __ eor(a12, a12, b12);
 8543       __ eor(a13, a13, b13);
 8544 
 8545       vs_andr(b_vec, b_vec, mask_vec);
 8546 
 8547       __ str(a10, Address(aLimbs, 10 * BytesPerLong));
 8548       __ str(a11, Address(aLimbs, 11 * BytesPerLong));
 8549       __ str(a12, Address(aLimbs, 12 * BytesPerLong));
 8550       __ str(a13, Address(aLimbs, 13 * BytesPerLong));
 8551 
 8552       vs_eor(a_vec, a_vec, b_vec);
 8553 
 8554       __ st1(a_vec[0], __ T2D, aLimbs);
 8555       vs_stpq_indexed(vs_tail(a_vec), aLimbs, 16, offsets);
 8556 
 8557       __ b(L_Done);
 8558     }
 8559 
 8560     // Length = 16
 8561     // Use 8 neon regs
 8562     __ BIND(L_Length16);
 8563     {
 8564       VSeq<8> a_vec(16);
 8565       VSeq<8> b_vec(24);
 8566 
 8567       vs_ldpq(a_vec, aLimbs);
 8568       vs_ldpq(b_vec, bLimbs);
 8569       vs_eor(b_vec, b_vec, a_vec);
 8570       vs_andr(b_vec, b_vec, mask_vec);
 8571       vs_eor(a_vec, a_vec, b_vec);
 8572       vs_stpq_post(a_vec, aLimbs);
 8573 
 8574       __ b(L_Done);
 8575     }
 8576 
 8577     // Length = 19
 8578     // Split into 8 neon regs and 3 GPRs
 8579     __ BIND(L_Length19);
 8580     {
 8581       Register a17 = r10;
 8582       Register a18 = r11;
 8583       Register a19 = r12;
 8584       Register b17 = r13;
 8585       Register b18 = r14;
 8586       Register b19 = r15;
 8587 
 8588       VSeq<8> a_vec(16);
 8589       VSeq<8> b_vec(24);
 8590 
 8591       __ ldr(a17, Address(aLimbs, 16 * BytesPerLong));
 8592       __ ldr(a18, Address(aLimbs, 17 * BytesPerLong));
 8593       __ ldr(a19, Address(aLimbs, 18 * BytesPerLong));
 8594       __ ldr(b17, Address(bLimbs, 16 * BytesPerLong));
 8595       __ ldr(b18, Address(bLimbs, 17 * BytesPerLong));
 8596       __ ldr(b19, Address(bLimbs, 18 * BytesPerLong));
 8597 
 8598       vs_ldpq(a_vec, aLimbs);
 8599 
 8600       __ eor(b17, b17, a17);
 8601       __ eor(b18, b18, a18);
 8602       __ eor(b19, b19, a19);
 8603 
 8604       vs_ldpq(b_vec, bLimbs);
 8605 
 8606       __ andr(b17, b17, mask_scalar);
 8607       __ andr(b18, b18, mask_scalar);
 8608       __ andr(b19, b19, mask_scalar);
 8609 
 8610       vs_eor(b_vec, b_vec, a_vec);
 8611 
 8612       __ eor(a17, a17, b17);
 8613       __ eor(a18, a18, b18);
 8614       __ eor(a19, a19, b19);
 8615 
 8616       vs_andr(b_vec, b_vec, mask_vec);
 8617 
 8618       __ str(a17, Address(aLimbs, 16 * BytesPerLong));
 8619       __ str(a18, Address(aLimbs, 17 * BytesPerLong));
 8620       __ str(a19, Address(aLimbs, 18 * BytesPerLong));
 8621 
 8622       vs_eor(a_vec, a_vec, b_vec);
 8623       vs_stpq_post(a_vec, aLimbs);
 8624 
 8625       __ b(L_Done);
 8626     }
 8627 
 8628     __ BIND(L_Default);
 8629     {
 8630       Register ctr = r5;
 8631       Register a_val = r6;
 8632       Register b_val = r7;
 8633 
 8634       __ mov(ctr, length); // length (the number of limbs) is never 0
 8635 
 8636       Label default_loop;
 8637       __ BIND(default_loop);
 8638 
 8639       __ ldr(a_val, aLimbs);
 8640       __ ldr(b_val, __ post(bLimbs, 8));
 8641       __ eor(b_val, b_val, a_val);
 8642       __ andr(b_val, b_val, mask_scalar);
 8643       __ eor(a_val, a_val, b_val);
 8644       __ str(a_val, __ post(aLimbs, 8));
 8645       __ sub(ctr, ctr, 1);
 8646       __ cmp(ctr, (u1)0);
 8647       __ br(Assembler::NE, default_loop);
 8648     }
 8649 
 8650     __ BIND(L_Done);
 8651     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8652     __ mov(r0, zr); // return 0
 8653     __ ret(lr);
 8654 
 8655     // record the stub entry and end
 8656     store_archive_data(stub_id, start, __ pc());
 8657 
 8658     return start;
 8659   }
 8660 
 8661   /**
 8662    * Arithmetic polynomial multiplication in Curve25519.  The algorithm mimics
 8663    * the version in the IntegerPolynomial25519 class, including the use of all
 8664    * columns (no folding method).
 8665    *
 8666    * Arguments:
 8667    *
 8668    * Inputs:
 8669    *   c_rarg0   - long[] aLimbs
 8670    *   c_rarg1   - long[] bLimbs
 8671    *
 8672    * Output:
 8673    *   c_rarg2   - long[] rLimbs result
 8674    */
 8675   address generate_intpoly_mult_25519() {
 8676     StubId stub_id = StubId::stubgen_intpoly_mult_25519_id;
 8677     int entry_count = StubInfo::entry_count(stub_id);
 8678     assert(entry_count == 1, "sanity check");
 8679     address start = load_archive_data(stub_id);
 8680     if (start != nullptr) {
 8681       return start;
 8682     }
 8683     __ align(CodeEntryAlignment);
 8684     StubCodeMark mark(this, stub_id);
 8685     start = __ pc();
 8686     __ enter();
 8687 
 8688     // Register Map
 8689     const Register aLimbs  = c_rarg0; // r0
 8690     const Register bLimbs  = c_rarg1; // r1
 8691     const Register rLimbs  = c_rarg2; // r2
 8692 
 8693     Register c[]   = {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12};
 8694     Register a     = r13;
 8695     Register b     = r14;
 8696     Register term  = r15;
 8697     Register low   = r16;
 8698     Register high  = r17;
 8699 
 8700     const int32_t limbs      = 5;
 8701     const int32_t bpl        = 51;
 8702     const int32_t rem        = 64 - bpl;
 8703     const int32_t TERM       = 19;
 8704     const int32_t columns    = limbs * 2;
 8705     const uint64_t mask      = (uint64_t) -1 >> rem;
 8706     const uint64_t CARRY_ADD = (uint64_t) 1 << (bpl - 1);
 8707 
 8708     __ mov(term, TERM);
 8709     for (int i = 0; i < columns; i++) {
 8710       __ mov(c[i], zr);
 8711     }
 8712 
 8713     // Perform high/low multiplication with signed 5x51 bit limbs
 8714     for (int i = 0; i < limbs; i++) {
 8715       __ ldr(b, Address(bLimbs, i * 8));
 8716       for (int j = 0; j < limbs; j++) {
 8717         __ ldr(a, Address(aLimbs, j * 8));
 8718         __ smulh(high, a, b);
 8719         __ mul(low, a, b);
 8720         __ extr(high, high, low, bpl);
 8721         __ andr(low, low,  mask);
 8722         __ add(c[i + j], c[i + j], low);
 8723         __ add(c[i + j + 1], c[i + j + 1], high);
 8724       }
 8725     }
 8726 
 8727     for (int i = 0; i < limbs; i++) {
 8728       __ mul(c[i + 5], c[i + 5], term);
 8729       __ add(c[i], c[i], c[i + 5]);
 8730     }
 8731 
 8732     // Carry-add with reduction from high limb
 8733     Register tmp       = low;
 8734     Register carry_add = high;
 8735     __ mov(carry_add, CARRY_ADD);
 8736 
 8737     // Limb 3
 8738     __ add(tmp, c[3], carry_add);
 8739     __ asr(tmp, tmp, bpl);
 8740     __ add(c[4], c[4], tmp);
 8741     __ lsl(tmp, tmp, bpl);
 8742     __ sub(c[3], c[3], tmp);
 8743 
 8744     // Limb 4
 8745     __ add(tmp, c[4], carry_add);
 8746     __ asr(tmp, tmp, bpl);
 8747 
 8748     // Reduce high order limb and fold back into low order limb
 8749     __ mul(term, tmp, term);
 8750     __ add(c[0], c[0], term);
 8751 
 8752     __ lsl(tmp, tmp, bpl);
 8753     __ sub(c[4], c[4], tmp);
 8754 
 8755     // Limbs 0 - 3
 8756     for (int i = 0; i < (limbs - 1); i++) {
 8757       __ add(tmp, c[i], carry_add);
 8758       __ asr(tmp, tmp, bpl);
 8759       __ add(c[i + 1], c[i + 1], tmp);
 8760       __ lsl(tmp, tmp, bpl);
 8761       __ sub(c[i], c[i], tmp);
 8762     }
 8763 
 8764     for (int i = 0; i < limbs; i++) {
 8765       __ str(c[i], Address(rLimbs, i * 8));
 8766     }
 8767 
 8768     __ mov(r0, 0);
 8769     __ leave();   // required for proper stackwalking of RuntimeStub frame
 8770     __ ret(lr);
 8771 
 8772     // record the stub entry and end
 8773     store_archive_data(stub_id, start, __ pc());
 8774 
 8775     return start;
 8776   }
 8777 
  // Emits code that updates five registers in place so that, with indices
  // taken modulo 5 and all right-hand sides using the values on entry,
  //
  //   a[i] = a[i] ^ (a[i + 2] & ~a[i + 1])
  //
  // tmp0..tmp2 are clobbered. The instruction order matters: every
  // and-not term is computed before either of its source registers is
  // overwritten, which is why three temporaries are enough.
  void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
             Register tmp0, Register tmp1, Register tmp2) {
    __ bic(tmp0, a2, a1); // for a0
    __ bic(tmp1, a3, a2); // for a1
    __ bic(tmp2, a4, a3); // for a2
    __ eor(a2, a2, tmp2); // a2 is final; its old value is already in tmp1
    __ bic(tmp2, a0, a4); // for a3
    __ eor(a3, a3, tmp2); // a3 is final; its old value is no longer needed
    __ bic(tmp2, a1, a0); // for a4
    __ eor(a0, a0, tmp0);
    __ eor(a1, a1, tmp1);
    __ eor(a4, a4, tmp2);
  }
 8791 
  // Emits one Keccak round on a state that is held entirely in the 25
  // general purpose registers a0..a24.
  //
  //   can_use_fp, can_use_r18 - only when both are true are rfp and
  //                             r18_tls used as extra temporaries (and
  //                             overwritten; NOTE(review): the caller
  //                             presumably saves and restores them -
  //                             confirm). Otherwise a4 and a9 are spilled
  //                             to the stack, used as temporaries and
  //                             reloaded.
  //   rc                      - address of the round constant for this
  //                             round; post-incremented by 8
  //   a0..a24                 - the state, updated in place
  //   tmp0..tmp2              - temporaries, clobbered
  //
  // The code first xors the column parities into the state, then rotates
  // and permutes the lanes, then applies bcax5 to each row and finally
  // xors the round constant into a0.
  void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
                        Register a0, Register a1, Register a2, Register a3, Register a4,
                        Register a5, Register a6, Register a7, Register a8, Register a9,
                        Register a10, Register a11, Register a12, Register a13, Register a14,
                        Register a15, Register a16, Register a17, Register a18, Register a19,
                        Register a20, Register a21, Register a22, Register a23, Register a24,
                        Register tmp0, Register tmp1, Register tmp2) {
    // Column parities c4 and c1 are computed before a4/a9 may be spilled
    // and reused as temporaries below.
    // NOTE(review): rax1(d, x, y) presumably yields d = x ^ rol(y, 1), like
    // the SHA3 RAX1 instruction - confirm against the helper's definition.
    __ eor3(tmp1, a4, a9, a14);
    __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
    __ eor3(tmp2, a1, a6, a11);
    __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
    __ rax1(tmp2, tmp0, tmp1); // d0
    {

      // Two more temporaries are needed for the remaining parities.
      Register tmp3, tmp4;
      if (can_use_fp && can_use_r18) {
        tmp3 = rfp;
        tmp4 = r18_tls;
      } else {
        // Borrow a4 and a9: save their state values on the stack; they are
        // reloaded below before d4 is applied to them.
        tmp3 = a4;
        tmp4 = a9;
        __ stp(tmp3, tmp4, __ pre(sp, -16));
      }

      __ eor3(tmp3, a0, a5, a10);
      __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
      __ eor(a0, a0, tmp2);
      __ eor(a5, a5, tmp2);
      __ eor(a10, a10, tmp2);
      __ eor(a15, a15, tmp2);
      __ eor(a20, a20, tmp2); // d0(tmp2)
      __ eor3(tmp3, a2, a7, a12);
      __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
      __ rax1(tmp3, tmp4, tmp2); // d1
      __ eor(a1, a1, tmp3);
      __ eor(a6, a6, tmp3);
      __ eor(a11, a11, tmp3);
      __ eor(a16, a16, tmp3);
      __ eor(a21, a21, tmp3); // d1(tmp3)
      __ rax1(tmp3, tmp2, tmp0); // d3
      // c4 (tmp0) has now been consumed by d0 and d3, so tmp0 can take c3
      __ eor3(tmp2, a3, a8, a13);
      __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
      __ eor(a3, a3, tmp3);
      __ eor(a8, a8, tmp3);
      __ eor(a13, a13, tmp3);
      __ eor(a18, a18, tmp3);
      __ eor(a23, a23, tmp3); // d3(tmp3)
      __ rax1(tmp2, tmp1, tmp0); // d2
      __ eor(a2, a2, tmp2);
      __ eor(a7, a7, tmp2);
      __ eor(a12, a12, tmp2);
      __ rax1(tmp0, tmp0, tmp4); // d4
      if (!can_use_fp || !can_use_r18) {
        // tmp3/tmp4 are no longer needed: bring back the saved a4 and a9
        __ ldp(tmp3, tmp4, __ post(sp, 16));
      }
      __ eor(a17, a17, tmp2);
      __ eor(a22, a22, tmp2); // d2(tmp2)
      __ eor(a4, a4, tmp0);
      __ eor(a9, a9, tmp0);
      __ eor(a14, a14, tmp0);
      __ eor(a19, a19, tmp0);
      __ eor(a24, a24, tmp0); // d4(tmp0)
    }

    // Rotate every lane except a0 and move it to its new position. The
    // moves form one chain, so each destination has been read by the
    // preceding instruction; only the first value (from a10) has to be
    // parked in tmp0 until the chain ends at a7.
    __ rol(tmp0, a10, 3);
    __ rol(a10, a1, 1);
    __ rol(a1, a6, 44);
    __ rol(a6, a9, 20);
    __ rol(a9, a22, 61);
    __ rol(a22, a14, 39);
    __ rol(a14, a20, 18);
    __ rol(a20, a2, 62);
    __ rol(a2, a12, 43);
    __ rol(a12, a13, 25);
    __ rol(a13, a19, 8) ;
    __ rol(a19, a23, 56);
    __ rol(a23, a15, 41);
    __ rol(a15, a4, 27);
    __ rol(a4, a24, 14);
    __ rol(a24, a21, 2);
    __ rol(a21, a8, 55);
    __ rol(a8, a16, 45);
    __ rol(a16, a5, 36);
    __ rol(a5, a3, 28);
    __ rol(a3, a18, 21);
    __ rol(a18, a17, 15);
    __ rol(a17, a11, 10);
    __ rol(a11, a7, 6);
    __ mov(a7, tmp0);

    // Non-linear step, applied to each group of five consecutive lanes
    bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
    bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
    bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
    bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
    bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);

    // Xor this round's constant into a0 and advance rc to the next one
    __ ldr(tmp1, __ post(rc, 8));
    __ eor(a0, a0, tmp1);

  }
 8892 
 8893   // Arguments:
 8894   //
 8895   // Inputs:
 8896   //   c_rarg0   - byte[]  source+offset
 8897   //   c_rarg1   - byte[]  SHA.state
 8898   //   c_rarg2   - int     block_size
 8899   //   c_rarg3   - int     offset
 8900   //   c_rarg4   - int     limit
 8901   //
 8902   address generate_sha3_implCompress_gpr(StubId stub_id) {
 8903     bool multi_block;
 8904     switch (stub_id) {
 8905     case StubId::stubgen_sha3_implCompress_id:
 8906       multi_block = false;
 8907       break;
 8908     case StubId::stubgen_sha3_implCompressMB_id:
 8909       multi_block = true;
 8910       break;
 8911     default:
 8912       ShouldNotReachHere();
 8913     }
 8914     int entry_count = StubInfo::entry_count(stub_id);
 8915     assert(entry_count == 1, "sanity check");
 8916     address start = load_archive_data(stub_id);
 8917     if (start != nullptr) {
 8918       return start;
 8919     }
 8920     __ align(CodeEntryAlignment);
 8921     StubCodeMark mark(this, stub_id);
 8922     start = __ pc();
 8923 
 8924     Register buf           = c_rarg0;
 8925     Register state         = c_rarg1;
 8926     Register block_size    = c_rarg2;
 8927     Register ofs           = c_rarg3;
 8928     Register limit         = c_rarg4;
 8929 
    // use r3..r17, r19..r28 to keep a0..a24.
 8931     // a0..a24 are respective locals from SHA3.java
 8932     Register a0 = r25,
 8933              a1 = r26,
 8934              a2 = r27,
 8935              a3 = r3,
 8936              a4 = r4,
 8937              a5 = r5,
 8938              a6 = r6,
 8939              a7 = r7,
 8940              a8 = rscratch1, // r8
 8941              a9 = rscratch2, // r9
 8942              a10 = r10,
 8943              a11 = r11,
 8944              a12 = r12,
 8945              a13 = r13,
 8946              a14 = r14,
 8947              a15 = r15,
 8948              a16 = r16,
 8949              a17 = r17,
 8950              a18 = r28,
 8951              a19 = r19,
 8952              a20 = r20,
 8953              a21 = r21,
 8954              a22 = r22,
 8955              a23 = r23,
 8956              a24 = r24;
 8957 
 8958     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 8959 
 8960     Label sha3_loop, rounds24_preloop, loop_body;
 8961     Label sha3_512_or_sha3_384, shake128;
 8962 
 8963     bool can_use_r18 = false;
 8964 #ifndef R18_RESERVED
 8965     can_use_r18 = true;
 8966 #endif
 8967     bool can_use_fp = !PreserveFramePointer;
 8968 
 8969     __ enter();
 8970 
 8971     // save almost all yet unsaved gpr registers on stack
 8972     __ str(block_size, __ pre(sp, -128));
 8973     if (multi_block) {
 8974       __ stpw(ofs, limit, Address(sp, 8));
 8975     }
 8976     // 8 bytes at sp+16 will be used to keep buf
 8977     __ stp(r19, r20, Address(sp, 32));
 8978     __ stp(r21, r22, Address(sp, 48));
 8979     __ stp(r23, r24, Address(sp, 64));
 8980     __ stp(r25, r26, Address(sp, 80));
 8981     __ stp(r27, r28, Address(sp, 96));
 8982     if (can_use_r18 && can_use_fp) {
 8983       __ stp(r18_tls, state, Address(sp, 112));
 8984     } else {
 8985       __ str(state, Address(sp, 112));
 8986     }
 8987 
    // begin sha3 calculations: loading a0..a24 from state array
 8989     __ ldp(a0, a1, state);
 8990     __ ldp(a2, a3, Address(state, 16));
 8991     __ ldp(a4, a5, Address(state, 32));
 8992     __ ldp(a6, a7, Address(state, 48));
 8993     __ ldp(a8, a9, Address(state, 64));
 8994     __ ldp(a10, a11, Address(state, 80));
 8995     __ ldp(a12, a13, Address(state, 96));
 8996     __ ldp(a14, a15, Address(state, 112));
 8997     __ ldp(a16, a17, Address(state, 128));
 8998     __ ldp(a18, a19, Address(state, 144));
 8999     __ ldp(a20, a21, Address(state, 160));
 9000     __ ldp(a22, a23, Address(state, 176));
 9001     __ ldr(a24, Address(state, 192));
 9002 
 9003     __ BIND(sha3_loop);
 9004 
 9005     // load input
 9006     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9007     __ eor(a0, a0, tmp3);
 9008     __ eor(a1, a1, tmp2);
 9009     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9010     __ eor(a2, a2, tmp3);
 9011     __ eor(a3, a3, tmp2);
 9012     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9013     __ eor(a4, a4, tmp3);
 9014     __ eor(a5, a5, tmp2);
 9015     __ ldr(tmp3, __ post(buf, 8));
 9016     __ eor(a6, a6, tmp3);
 9017 
 9018     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 9019     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 9020 
 9021     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9022     __ eor(a7, a7, tmp3);
 9023     __ eor(a8, a8, tmp2);
 9024     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9025     __ eor(a9, a9, tmp3);
 9026     __ eor(a10, a10, tmp2);
 9027     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9028     __ eor(a11, a11, tmp3);
 9029     __ eor(a12, a12, tmp2);
 9030     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9031     __ eor(a13, a13, tmp3);
 9032     __ eor(a14, a14, tmp2);
 9033     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9034     __ eor(a15, a15, tmp3);
 9035     __ eor(a16, a16, tmp2);
 9036 
 9037     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 9038     __ andw(tmp2, block_size, 48);
 9039     __ cbzw(tmp2, rounds24_preloop);
 9040     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
 9042     __ ldr(tmp3, __ post(buf, 8));
 9043     __ eor(a17, a17, tmp3);
 9044     __ b(rounds24_preloop);
 9045 
 9046     __ BIND(shake128);
 9047     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9048     __ eor(a17, a17, tmp3);
 9049     __ eor(a18, a18, tmp2);
 9050     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9051     __ eor(a19, a19, tmp3);
 9052     __ eor(a20, a20, tmp2);
 9053     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 9054 
 9055     __ BIND(sha3_512_or_sha3_384);
 9056     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9057     __ eor(a7, a7, tmp3);
 9058     __ eor(a8, a8, tmp2);
 9059     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 9060 
 9061     // SHA3-384
 9062     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9063     __ eor(a9, a9, tmp3);
 9064     __ eor(a10, a10, tmp2);
 9065     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9066     __ eor(a11, a11, tmp3);
 9067     __ eor(a12, a12, tmp2);
 9068 
 9069     __ BIND(rounds24_preloop);
 9070     __ fmovs(v0, 24.0); // float loop counter,
 9071     __ fmovs(v1, 1.0);  // exact representation
 9072 
 9073     __ str(buf, Address(sp, 16));
 9074     __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));
 9075 
 9076     __ BIND(loop_body);
 9077     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 9078                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 9079                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 9080                      tmp0, tmp1, tmp2);
 9081     __ fsubs(v0, v0, v1);
 9082     __ fcmps(v0, 0.0);
 9083     __ br(__ NE, loop_body);
 9084 
 9085     if (multi_block) {
 9086       __ ldrw(block_size, sp); // block_size
 9087       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 9088       __ addw(tmp2, tmp2, block_size);
 9089       __ cmpw(tmp2, tmp1);
 9090       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 9091       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 9092       __ br(Assembler::LE, sha3_loop);
 9093       __ movw(c_rarg0, tmp2); // return offset
 9094     }
 9095     if (can_use_fp && can_use_r18) {
 9096       __ ldp(r18_tls, state, Address(sp, 112));
 9097     } else {
 9098       __ ldr(state, Address(sp, 112));
 9099     }
 9100     // save calculated sha3 state
 9101     __ stp(a0, a1, Address(state));
 9102     __ stp(a2, a3, Address(state, 16));
 9103     __ stp(a4, a5, Address(state, 32));
 9104     __ stp(a6, a7, Address(state, 48));
 9105     __ stp(a8, a9, Address(state, 64));
 9106     __ stp(a10, a11, Address(state, 80));
 9107     __ stp(a12, a13, Address(state, 96));
 9108     __ stp(a14, a15, Address(state, 112));
 9109     __ stp(a16, a17, Address(state, 128));
 9110     __ stp(a18, a19, Address(state, 144));
 9111     __ stp(a20, a21, Address(state, 160));
 9112     __ stp(a22, a23, Address(state, 176));
 9113     __ str(a24, Address(state, 192));
 9114 
 9115     // restore required registers from stack
 9116     __ ldp(r19, r20, Address(sp, 32));
 9117     __ ldp(r21, r22, Address(sp, 48));
 9118     __ ldp(r23, r24, Address(sp, 64));
 9119     __ ldp(r25, r26, Address(sp, 80));
 9120     __ ldp(r27, r28, Address(sp, 96));
 9121     if (can_use_fp && can_use_r18) {
 9122       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 9123     } // else no need to recalculate rfp, since it wasn't changed
 9124 
 9125     __ leave();
 9126 
 9127     __ ret(lr);
 9128 
 9129     // record the stub entry and end
 9130     store_archive_data(stub_id, start, __ pc());
 9131 
 9132     return start;
 9133   }
 9134 
 9135   /**
 9136    *  Arguments:
 9137    *
 9138    * Inputs:
 9139    *   c_rarg0   - int crc
 9140    *   c_rarg1   - byte* buf
 9141    *   c_rarg2   - int length
 9142    *
 9143    * Output:
 9144    *       rax   - int crc result
 9145    */
 9146   address generate_updateBytesCRC32() {
 9147     assert(UseCRC32Intrinsics, "what are we doing here?");
 9148     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 9149     int entry_count = StubInfo::entry_count(stub_id);
 9150     assert(entry_count == 1, "sanity check");
 9151     address start = load_archive_data(stub_id);
 9152     if (start != nullptr) {
 9153       return start;
 9154     }
 9155     __ align(CodeEntryAlignment);
 9156     StubCodeMark mark(this, stub_id);
 9157 
 9158     start = __ pc();
 9159 
 9160     const Register crc   = c_rarg0;  // crc
 9161     const Register buf   = c_rarg1;  // source java byte array address
 9162     const Register len   = c_rarg2;  // length
 9163     const Register table0 = c_rarg3; // crc_table address
 9164     const Register table1 = c_rarg4;
 9165     const Register table2 = c_rarg5;
 9166     const Register table3 = c_rarg6;
 9167     const Register tmp3 = c_rarg7;
 9168 
 9169     BLOCK_COMMENT("Entry:");
 9170     __ enter(); // required for proper stackwalking of RuntimeStub frame
 9171 
 9172     __ kernel_crc32(crc, buf, len,
 9173               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 9174 
 9175     __ leave(); // required for proper stackwalking of RuntimeStub frame
 9176     __ ret(lr);
 9177 
 9178     // record the stub entry and end
 9179     store_archive_data(stub_id, start, __ pc());
 9180 
 9181     return start;
 9182   }
 9183 
 9184   /**
 9185    *  Arguments:
 9186    *
 9187    * Inputs:
 9188    *   c_rarg0   - int crc
 9189    *   c_rarg1   - byte* buf
 9190    *   c_rarg2   - int length
 9191    *   c_rarg3   - int* table
 9192    *
 9193    * Output:
 9194    *       r0   - int crc result
 9195    */
 9196   address generate_updateBytesCRC32C() {
 9197     assert(UseCRC32CIntrinsics, "what are we doing here?");
 9198     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 9199     int entry_count = StubInfo::entry_count(stub_id);
 9200     assert(entry_count == 1, "sanity check");
 9201     address start = load_archive_data(stub_id);
 9202     if (start != nullptr) {
 9203       return start;
 9204     }
 9205     __ align(CodeEntryAlignment);
 9206     StubCodeMark mark(this, stub_id);
 9207 
 9208     start = __ pc();
 9209 
 9210     const Register crc   = c_rarg0;  // crc
 9211     const Register buf   = c_rarg1;  // source java byte array address
 9212     const Register len   = c_rarg2;  // length
 9213     const Register table0 = c_rarg3; // crc_table address
 9214     const Register table1 = c_rarg4;
 9215     const Register table2 = c_rarg5;
 9216     const Register table3 = c_rarg6;
 9217     const Register tmp3 = c_rarg7;
 9218 
 9219     BLOCK_COMMENT("Entry:");
 9220     __ enter(); // required for proper stackwalking of RuntimeStub frame
 9221 
 9222     __ kernel_crc32c(crc, buf, len,
 9223               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 9224 
 9225     __ leave(); // required for proper stackwalking of RuntimeStub frame
 9226     __ ret(lr);
 9227 
 9228     // record the stub entry and end
 9229     store_archive_data(stub_id, start, __ pc());
 9230 
 9231     return start;
 9232   }
 9233 
  /***
   *  Emits the updateBytesAdler32 stub.
   *
   *  The checksum is kept as two running sums, s1 (low 16 bits of adler) and
   *  s2 (high 16 bits of adler). Input is consumed in passes of NMAX bytes
   *  (16 bytes per vector step), then in 16-byte steps, then byte by byte;
   *  both sums are reduced modulo BASE after each NMAX pass and once at the
   *  end. Inputs shorter than 16 bytes take a separate byte-by-byte path.
   *
   *  The stub sets up no frame (there is no enter()/leave() pair).
   *
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int   len
   *
   * Output:
   *   c_rarg0   - int adler result
   *
   * Registers written besides the arguments: c_rarg3, r4, r5, r6, rscratch1,
   * rscratch2 and v0..v3.
   */
  address generate_updateBytesAdler32() {
    StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // A non-null result from load_archive_data() is used as the stub entry
    // and nothing is generated.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    // Note: adler and s1 share c_rarg0, so s1 doubles as the result register.
    Register adler  = c_rarg0;
    Register s1     = c_rarg0;
    Register s2     = c_rarg3;
    Register buff   = c_rarg1;
    Register len    = c_rarg2;
    Register nmax  = r4;
    Register base  = r5;
    Register count = r6;
    Register temp0 = rscratch1;
    Register temp1 = rscratch2;
    FloatRegister vbytes = v0;
    FloatRegister vs1acc = v1;
    FloatRegister vs2acc = v2;
    FloatRegister vtable = v3;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    uint64_t BASE = 0xfff1;
    uint64_t NMAX = 0x15B0;

    __ mov(base, BASE);
    __ mov(nmax, NMAX);

    // Load accumulation coefficients for the upper 16 bits
    __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
    __ ld1(vtable, __ T16B, Address(temp0));

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    // (s2 is extracted first because s1 overwrites adler's register)
    __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
    __ uxth(s1, adler);          // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    __ cmp(len, (u1)16);
    __ br(Assembler::HS, L_nmax);
    // len == 0: s1/s2 are already 16-bit values, just recombine them
    __ cbz(len, L_combine);

    // 1..15 bytes: plain byte-at-a-time update
    __ bind(L_simple_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, s1, temp0);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    // HI after the subtract: loop while the decremented len is still non-zero
    __ br(Assembler::HI, L_simple_by1_loop);

    // s1 = s1 % BASE
    // At most 15 bytes were added to a 16-bit value, so s1 < 2 * BASE and a
    // single conditional subtract is enough.
    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    // Fold: since 2^16 == 15 (mod BASE), s2 is congruent to
    // 15 * (s2 >> 16) + (s2 & 0xffff); 15 * x is computed as (x << 4) - x.
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(s2, temp1, s2, ext::uxth);

    // final conditional subtract brings the folded value below BASE
    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ b(L_combine);

    // len >= 16
    __ bind(L_nmax);
    // len -= NMAX; a borrow (LO) means fewer than NMAX bytes are left
    __ subs(len, len, nmax);
    // count drives the 16-byte steps of one NMAX pass
    __ sub(count, nmax, 16);
    __ br(Assembler::LO, L_by16);

    // One full pass over NMAX bytes, 16 bytes per iteration
    __ bind(L_nmax_loop);

    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
                                      vbytes, vs1acc, vs2acc, vtable);

    __ subs(count, count, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // s1 = s1 % BASE
    // Two folds of the form 15 * (v >> 16) + (v & 0xffff), then one
    // conditional subtract.
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    // Same two folds plus conditional subtract as for s1.
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Another full NMAX pass if at least NMAX bytes remain; count is re-armed
    // either way.
    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // Fewer than NMAX bytes remain; len currently holds (remaining - NMAX).
    __ bind(L_by16);
    // len += NMAX - 16, i.e. len = remaining - 16; no carry (LO) means
    // fewer than 16 bytes remain.
    __ adds(len, len, count);
    __ br(Assembler::LO, L_by1);

    __ bind(L_by16_loop);

    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
                                      vbytes, vs1acc, vs2acc, vtable);

    __ subs(len, len, 16);
    __ br(Assembler::HS, L_by16_loop);

    // len holds (remaining - 16) here, with remaining in 0..15
    __ bind(L_by1);
    // len = remaining - 1; no carry (LO) means nothing is left
    __ adds(len, len, 15);
    __ br(Assembler::LO, L_do_mod);

    // Tail: one byte per iteration
    __ bind(L_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HS, L_by1_loop);

    // Final reduction of both sums (two folds + conditional subtract each)
    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Combine lower bits and higher bits
    // (s1 is c_rarg0, so this also places the result in the output register)
    __ bind(L_combine);
    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)

    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 9428 
  // Emits one 16-byte Adler-32 accumulation step.
  //
  //   s1, s2          - running sums, updated in place
  //   buff            - input pointer, post-incremented by 16
  //   temp0, temp1    - general-purpose scratch registers
  //   vbytes          - receives the 16 input bytes
  //   vs1acc, vs2acc  - vector scratch for the byte sum / weighted byte sum
  //   vtable          - per-byte weights, loaded by the caller
  //                     (presumably 16, 15, ..., 1 per the derivation below)
  //
  // No modulo reduction happens here; the caller bounds the number of steps
  // between reductions.
  void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
          Register temp0, Register temp1, FloatRegister vbytes,
          FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
    // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
    // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b16
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b16
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
    //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
    //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
    __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));

    // s2 = s2 + s1 * 16
    // (must use the old s1, so this is emitted before s1 is updated below)
    __ add(s2, s2, s1, Assembler::LSL, 4);

    // vs1acc = b1 + b2 + b3 + ... + b16
    // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
    // Presumably the T8B multiply covers the low eight byte lanes and the
    // T16B multiply-accumulate adds the high eight; the across-lanes adds
    // then collapse each vector to a single sum.
    __ umullv(vs2acc, __ T8B, vtable, vbytes);
    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
    __ uaddlv(vs1acc, __ T16B, vbytes);
    __ uaddlv(vs2acc, __ T8H, vs2acc);

    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
    // (move the two sums to general-purpose registers first)
    __ fmovd(temp0, vs1acc);
    __ fmovd(temp1, vs2acc);
    __ add(s1, s1, temp0);
    __ add(s2, s2, temp1);
  }
 9465 
 9466   /**
 9467    *  Arguments:
 9468    *
 9469    *  Input:
 9470    *    c_rarg0   - x address
 9471    *    c_rarg1   - x length
 9472    *    c_rarg2   - y address
 9473    *    c_rarg3   - y length
 9474    *    c_rarg4   - z address
 9475    */
 9476   address generate_multiplyToLen() {
 9477     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 9478     int entry_count = StubInfo::entry_count(stub_id);
 9479     assert(entry_count == 1, "sanity check");
 9480     address start = load_archive_data(stub_id);
 9481     if (start != nullptr) {
 9482       return start;
 9483     }
 9484     __ align(CodeEntryAlignment);
 9485     StubCodeMark mark(this, stub_id);
 9486 
 9487     start = __ pc();
 9488     const Register x     = r0;
 9489     const Register xlen  = r1;
 9490     const Register y     = r2;
 9491     const Register ylen  = r3;
 9492     const Register z     = r4;
 9493 
 9494     const Register tmp0  = r5;
 9495     const Register tmp1  = r10;
 9496     const Register tmp2  = r11;
 9497     const Register tmp3  = r12;
 9498     const Register tmp4  = r13;
 9499     const Register tmp5  = r14;
 9500     const Register tmp6  = r15;
 9501     const Register tmp7  = r16;
 9502 
 9503     BLOCK_COMMENT("Entry:");
 9504     __ enter(); // required for proper stackwalking of RuntimeStub frame
 9505     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 9506     __ leave(); // required for proper stackwalking of RuntimeStub frame
 9507     __ ret(lr);
 9508 
 9509     // record the stub entry and end
 9510     store_archive_data(stub_id, start, __ pc());
 9511 
 9512     return start;
 9513   }
 9514 
 9515   address generate_squareToLen() {
 9516     // squareToLen algorithm for sizes 1..127 described in java code works
 9517     // faster than multiply_to_len on some CPUs and slower on others, but
 9518     // multiply_to_len shows a bit better overall results
 9519     StubId stub_id = StubId::stubgen_squareToLen_id;
 9520     int entry_count = StubInfo::entry_count(stub_id);
 9521     assert(entry_count == 1, "sanity check");
 9522     address start = load_archive_data(stub_id);
 9523     if (start != nullptr) {
 9524       return start;
 9525     }
 9526     __ align(CodeEntryAlignment);
 9527     StubCodeMark mark(this, stub_id);
 9528     start = __ pc();
 9529 
 9530     const Register x     = r0;
 9531     const Register xlen  = r1;
 9532     const Register z     = r2;
 9533     const Register y     = r4; // == x
 9534     const Register ylen  = r5; // == xlen
 9535 
 9536     const Register tmp0  = r3;
 9537     const Register tmp1  = r10;
 9538     const Register tmp2  = r11;
 9539     const Register tmp3  = r12;
 9540     const Register tmp4  = r13;
 9541     const Register tmp5  = r14;
 9542     const Register tmp6  = r15;
 9543     const Register tmp7  = r16;
 9544 
 9545     RegSet spilled_regs = RegSet::of(y, ylen);
 9546     BLOCK_COMMENT("Entry:");
 9547     __ enter();
 9548     __ push(spilled_regs, sp);
 9549     __ mov(y, x);
 9550     __ mov(ylen, xlen);
 9551     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 9552     __ pop(spilled_regs, sp);
 9553     __ leave();
 9554     __ ret(lr);
 9555 
 9556     // record the stub entry and end
 9557     store_archive_data(stub_id, start, __ pc());
 9558 
 9559     return start;
 9560   }
 9561 
 9562   address generate_mulAdd() {
 9563     StubId stub_id = StubId::stubgen_mulAdd_id;
 9564     int entry_count = StubInfo::entry_count(stub_id);
 9565     assert(entry_count == 1, "sanity check");
 9566     address start = load_archive_data(stub_id);
 9567     if (start != nullptr) {
 9568       return start;
 9569     }
 9570     __ align(CodeEntryAlignment);
 9571     StubCodeMark mark(this, stub_id);
 9572 
 9573     start = __ pc();
 9574 
 9575     const Register out     = r0;
 9576     const Register in      = r1;
 9577     const Register offset  = r2;
 9578     const Register len     = r3;
 9579     const Register k       = r4;
 9580 
 9581     BLOCK_COMMENT("Entry:");
 9582     __ enter();
 9583     __ mul_add(out, in, offset, len, k);
 9584     __ leave();
 9585     __ ret(lr);
 9586 
 9587     // record the stub entry and end
 9588     store_archive_data(stub_id, start, __ pc());
 9589 
 9590     return start;
 9591   }
 9592 
  // Emits the bigIntegerRightShiftWorker stub.
  //
  // For each i in [0, numIter) the generated code stores
  //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
  //                      | (oldArr[i] << (32 - shiftCount))
  // on 32-bit words, working from the highest index downwards: four words
  // per SIMD iteration, then two at a time, then a single word. Counts
  // below four take a scalar path. No frame is set up.
  //
  // Arguments:
  //
  // Input:
  //   c_rarg0   - newArr address
  //   c_rarg1   - oldArr address
  //   c_rarg2   - newIdx
  //   c_rarg3   - shiftCount
  //   c_rarg4   - numIter
  //
  // Registers written: c_rarg0, c_rarg4, rscratch1, rscratch2, r10..r14
  // and v0..v4.
  //
  // NOTE(review): the scalar path shifts with lslvw by (32 - shiftCount);
  // presumably callers never pass shiftCount == 0 - confirm against caller.
  //
  address generate_bigIntegerRightShift() {
    StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // A non-null result from load_archive_data() is used as the stub entry
    // and nothing is generated.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr        = c_rarg0;
    Register oldArr        = c_rarg1;
    Register newIdx        = c_rarg2;
    Register shiftCount    = c_rarg3;
    Register numIter       = c_rarg4;
    // idx aliases numIter: the count is consumed as the running word index
    Register idx           = numIter;

    Register newArrCur     = rscratch1;
    Register shiftRevCount = rscratch2;
    Register oldArrCur     = r13;
    Register oldArrNext    = r14;

    FloatRegister oldElem0        = v0;
    FloatRegister oldElem1        = v1;
    FloatRegister newElem         = v2;
    FloatRegister shiftVCount     = v3;
    FloatRegister shiftVRevCount  = v4;

    // nothing to do for numIter == 0
    __ cbz(idx, Exit);

    // newArr = &newArr[newIdx] (4-byte elements)
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // left shift count
    // shiftRevCount = 32 - shiftCount
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, rolling back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    // Broadcast both counts; shiftVCount is negated because USHL shifts
    // right when given a negative count.
    __ dup(shiftVCount,    __ T4S, shiftCount);
    __ dup(shiftVRevCount, __ T4S, shiftRevCount);
    __ negr(shiftVCount,   __ T4S, shiftVCount);

    __ BIND(ShiftSIMDLoop);

    // Calculate the load addresses
    // oldArrNext = &oldArr[idx], oldArrCur = &oldArr[idx + 1],
    // newArrCur = &newArr[idx], with idx already stepped down by 4
    __ sub(idx, idx, 4);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur,  oldArrNext, 4);

    // Load 4 words and process
    // newElem = (oldArr[idx+1 ..] >>> shiftCount) | (oldArr[idx ..] << shiftRevCount)
    __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
    __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
    __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
    __ orr(newElem,   __ T16B, oldElem0, oldElem1);
    __ st1(newElem,   __ T4S,  Address(newArrCur));

    // keep using the 4-word loop while at least 4 words remain
    __ cmp(idx, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    // 0..3 words remain: done on 0, single word on 1, otherwise 2 at a time
    __ BIND(ShiftTwoLoop);
    __ cbz(idx, Exit);
    __ cmp(idx, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // Calculate the load addresses
    __ sub(idx, idx, 2);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur,  oldArrNext, 4);

    // Load 2 words and process
    __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
    __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
    __ orr(newElem,   __ T8B, oldElem0, oldElem1);
    __ st1(newElem,   __ T2S, Address(newArrCur));
    __ b(ShiftTwoLoop);

    // Scalar path for numIter in 1..3: bit 1 clear means 1 word, bit 0
    // clear means 2 words, otherwise 3. Each label handles one word and
    // falls through to the next.
    __ BIND(ShiftThree);
    __ tbz(idx, 1, ShiftOne);
    __ tbz(idx, 0, ShiftTwo);
    // word 2: newArr[2] = (oldArr[3] >>> shiftCount) | (oldArr[2] << shiftRevCount)
    __ ldrw(r10,  Address(oldArr, 12));
    __ ldrw(r11,  Address(oldArr, 8));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr, 8));

    // word 1
    __ BIND(ShiftTwo);
    __ ldrw(r10,  Address(oldArr, 8));
    __ ldrw(r11,  Address(oldArr, 4));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr, 4));

    // word 0 (also the tail of the SIMD path when one word is left)
    __ BIND(ShiftOne);
    __ ldrw(r10,  Address(oldArr, 4));
    __ ldrw(r11,  Address(oldArr));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 9724 
  // Emits the bigIntegerLeftShiftWorker stub.
  //
  // For each i in [0, numIter) the generated code stores
  //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
  //                      | (oldArr[i + 1] >>> (32 - shiftCount))
  // on 32-bit words, working from index 0 upwards with post-incremented
  // pointers: four words per SIMD iteration, then two at a time, then a
  // single word. Counts below four take a scalar path. No frame is set up.
  //
  // Arguments:
  //
  // Input:
  //   c_rarg0   - newArr address
  //   c_rarg1   - oldArr address
  //   c_rarg2   - newIdx
  //   c_rarg3   - shiftCount
  //   c_rarg4   - numIter
  //
  // Registers written: c_rarg0, c_rarg1, c_rarg4, rscratch1, rscratch2,
  // r10..r12 and v0..v4.
  //
  // NOTE(review): the scalar path shifts with lsrvw by (32 - shiftCount);
  // presumably callers never pass shiftCount == 0 - confirm against caller.
  //
  address generate_bigIntegerLeftShift() {
    StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // A non-null result from load_archive_data() is used as the stub entry
    // and nothing is generated.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr        = c_rarg0;
    Register oldArr        = c_rarg1;
    Register newIdx        = c_rarg2;
    Register shiftCount    = c_rarg3;
    Register numIter       = c_rarg4;

    Register shiftRevCount = rscratch1;
    Register oldArrNext    = rscratch2;

    FloatRegister oldElem0        = v0;
    FloatRegister oldElem1        = v1;
    FloatRegister newElem         = v2;
    FloatRegister shiftVCount     = v3;
    FloatRegister shiftVRevCount  = v4;

    // nothing to do for numIter == 0
    __ cbz(numIter, Exit);

    // oldArrNext = &oldArr[1]; newArr = &newArr[newIdx] (4-byte elements)
    __ add(oldArrNext, oldArr, 4);
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // right shift count
    // shiftRevCount = 32 - shiftCount
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, rolling back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    // Broadcast both counts; shiftVRevCount is negated because USHL shifts
    // right when given a negative count.
    __ dup(shiftVCount,     __ T4S, shiftCount);
    __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
    __ negr(shiftVRevCount, __ T4S, shiftVRevCount);

    __ BIND(ShiftSIMDLoop);

    // load 4 words and process
    // newElem = (oldArr[i ..] << shiftCount) | (oldArr[i+1 ..] >>> shiftRevCount);
    // all three pointers advance by 16 bytes
    __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
    __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
    __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
    __ orr(newElem,   __ T16B, oldElem0, oldElem1);
    __ st1(newElem,   __ T4S,  __ post(newArr, 16));
    __ sub(numIter,   numIter, 4);

    // keep using the 4-word loop while at least 4 words remain
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    // 0..3 words remain: done on 0, single word on 1, otherwise 2 at a time
    __ BIND(ShiftTwoLoop);
    __ cbz(numIter, Exit);
    __ cmp(numIter, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // load 2 words and process
    __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
    __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
    __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
    __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
    __ st1(newElem,   __ T2S,  __ post(newArr, 8));
    __ sub(numIter,   numIter, 2);
    __ b(ShiftTwoLoop);

    // Scalar path for numIter in 1..3. The first word is always processed
    // here; numIter itself is not decremented, its bits select how many
    // more words follow.
    __ BIND(ShiftThree);
    __ ldrw(r10,  __ post(oldArr, 4));
    __ ldrw(r11,  __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  __ post(newArr, 4));
    // bit 1 clear: numIter == 1, the single word is done
    __ tbz(numIter, 1, Exit);
    // bit 0 clear: numIter == 2, exactly one more word
    __ tbz(numIter, 0, ShiftOne);

    // numIter == 3: second word here, third word via fall-through
    __ BIND(ShiftTwo);
    __ ldrw(r10,  __ post(oldArr, 4));
    __ ldrw(r11,  __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  __ post(newArr, 4));

    // last word (also the tail of the SIMD path when one word is left);
    // no pointer update is needed any more
    __ BIND(ShiftOne);
    __ ldrw(r10,  Address(oldArr));
    __ ldrw(r11,  Address(oldArrNext));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 9844 
 9845   address generate_count_positives(address &count_positives_long) {
          // Generates the count_positives stub, or reuses a copy found in the
          // archive. The stub has two entry points: the returned address, which
          // dispatches on len, and count_positives_long (written to the
          // out-parameter), which enters the long path directly.
          //
          // Stub registers: ary1 = r1 (array address), len = r2 (byte count),
          // result = r0. On entry result must already hold a copy of len. On exit
          // result is unchanged when no byte with its top bit set was seen;
          // otherwise the short path returns 0 and the long path returns result
          // minus the number of bytes not yet known to be positive.
 9846     StubId stub_id = StubId::stubgen_count_positives_id;
 9847     GrowableArray<address> entries;
 9848     int entry_count = StubInfo::entry_count(stub_id);
 9849     // We have an extra entry for count_positives_long.
 9850     assert(entry_count == 2, "sanity check");
 9851     address start = load_archive_data(stub_id, &entries);
 9852     if (start != nullptr) {
            // Archived copy found: the single extra entry is count_positives_long.
 9853       assert(entries.length() == 1,
 9854              "unexpected extra entry count %d", entries.length());
 9855       count_positives_long = entries.at(0);
 9856       return start;
 9857     }
 9858     const u1 large_loop_size = 64;
          // 0x80 in each of the 8 bytes: tst against this mask is non-zero iff at
          // least one byte of the tested word has its top bit set.
 9859     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 9860     int dcache_line = VM_Version::dcache_line_size();
 9861 
 9862     Register ary1 = r1, len = r2, result = r0;
 9863 
 9864     __ align(CodeEntryAlignment);
 9865     StubCodeMark mark(this, stub_id);
 9866 
 9867     address entry = __ pc();
 9868 
 9869     __ enter();
 9870     // precondition: a copy of len is already in result
 9871     // __ mov(result, len);
 9872 
 9873   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 9874         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 9875 
        // Lengths above 15 go to the long path; the frame set up by the enter()
        // above is the one that path tears down.
 9876   __ cmp(len, (u1)15);
 9877   __ br(Assembler::GT, LEN_OVER_15);
 9878   // The only case when execution falls into this code is when pointer is near
 9879   // the end of memory page and we have to avoid reading next page
        // ary1 becomes the address one past the last byte; the loads below use
        // negative offsets from it, so they end exactly at the end of the array.
 9880   __ add(ary1, ary1, len);
 9881   __ subs(len, len, 8);
 9882   __ br(Assembler::GT, LEN_OVER_8);
        // len <= 8 originally, so len is now <= 0. Load the 8 bytes ending at the
        // array end and shift out the (8 - original len) * 8 low-order bits
        // before testing.
 9883   __ ldr(rscratch2, Address(ary1, -8));
 9884   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 9885   __ lsrv(rscratch2, rscratch2, rscratch1);
 9886   __ tst(rscratch2, UPPER_BIT_MASK);
        // result = 0 if a top bit was found, otherwise result is left as passed in.
 9887   __ csel(result, zr, result, Assembler::NE);
 9888   __ leave();
 9889   __ ret(lr);
 9890   __ bind(LEN_OVER_8);
        // 9..15 bytes: load the 16 bytes ending at the array end. rscratch2 holds
        // the last 8 bytes and is tested whole; rscratch1 is shifted like above.
 9891   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 9892   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 9893   __ tst(rscratch2, UPPER_BIT_MASK);
 9894   __ br(Assembler::NE, RET_NO_POP);
 9895   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 9896   __ lsrv(rscratch1, rscratch1, rscratch2);
 9897   __ tst(rscratch1, UPPER_BIT_MASK);
        // Reached with the flags of one of the two tst instructions above; no
        // registers were pushed on this path, so there is nothing to pop.
 9898   __ bind(RET_NO_POP);
 9899   __ csel(result, zr, result, Assembler::NE);
 9900   __ leave();
 9901   __ ret(lr);
 9902 
 9903   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 9904   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 9905 
 9906   count_positives_long = __ pc(); // 2nd entry point
 9907   entries.append(count_positives_long);
 9908 
        // Frame setup for callers using the second entry point. The branch from
        // the first entry lands on LEN_OVER_15, i.e. after this enter().
 9909   __ enter();
 9910 
 9911   __ bind(LEN_OVER_15);
 9912     __ push(spilled_regs, sp);
 9913     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 9914     __ cbz(rscratch2, ALIGNED);
          // Unaligned start: test the first 16 bytes with one unaligned load, then
          // advance ary1 to the next 16-byte boundary. The advance is less than
          // 16, so some of these bytes are tested again by the aligned loads.
 9915     __ ldp(tmp6, tmp1, Address(ary1));
 9916     __ mov(tmp5, 16);
 9917     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 9918     __ add(ary1, ary1, rscratch1);
 9919     __ orr(tmp6, tmp6, tmp1);
 9920     __ tst(tmp6, UPPER_BIT_MASK);
          // len has not been reduced yet, so RET_ADJUST subtracts the full length.
 9921     __ br(Assembler::NE, RET_ADJUST);
 9922     __ sub(len, len, rscratch1);
 9923 
 9924   __ bind(ALIGNED);
 9925     __ cmp(len, large_loop_size);
 9926     __ br(Assembler::LT, CHECK_16);
 9927     // Perform 16-byte load as early return in pre-loop to handle situation
 9928     // when initially aligned large array has negative values at starting bytes,
 9929     // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
 9930     // slower. Cases with negative bytes further ahead won't be affected that
 9931     // much. In fact, it'll be faster due to early loads, less instructions and
 9932     // less branches in LARGE_LOOP.
 9933     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 9934     __ sub(len, len, 16);
 9935     __ orr(tmp6, tmp6, tmp1);
 9936     __ tst(tmp6, UPPER_BIT_MASK);
 9937     __ br(Assembler::NE, RET_ADJUST_16);
 9938     __ cmp(len, large_loop_size);
 9939     __ br(Assembler::LT, CHECK_16);
 9940 
 9941     if (SoftwarePrefetchHintDistance >= 0
 9942         && SoftwarePrefetchHintDistance >= dcache_line) {
 9943       // initial prefetch
 9944       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 9945     }
 9946   __ bind(LARGE_LOOP);
 9947     if (SoftwarePrefetchHintDistance >= 0) {
 9948       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 9949     }
 9950     // Issue load instructions first, since it can save few CPU/MEM cycles, also
 9951     // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
 9952     // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
 9953     // instructions per cycle and have less branches, but this approach disables
 9954     // early return, thus, all 64 bytes are loaded and checked every time.
          // NOTE(review): the sequence emitted below ends in tst + br(NE) rather
          // than the andr + cbnz pair mentioned above.
 9955     __ ldp(tmp2, tmp3, Address(ary1));
 9956     __ ldp(tmp4, tmp5, Address(ary1, 16));
 9957     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 9958     __ ldp(tmp6, tmp1, Address(ary1, 48));
 9959     __ add(ary1, ary1, large_loop_size);
 9960     __ sub(len, len, large_loop_size);
          // OR all eight loaded words together so a single test covers 64 bytes.
 9961     __ orr(tmp2, tmp2, tmp3);
 9962     __ orr(tmp4, tmp4, tmp5);
 9963     __ orr(rscratch1, rscratch1, rscratch2);
 9964     __ orr(tmp6, tmp6, tmp1);
 9965     __ orr(tmp2, tmp2, tmp4);
 9966     __ orr(rscratch1, rscratch1, tmp6);
 9967     __ orr(tmp2, tmp2, rscratch1);
 9968     __ tst(tmp2, UPPER_BIT_MASK);
 9969     __ br(Assembler::NE, RET_ADJUST_LONG);
 9970     __ cmp(len, large_loop_size);
 9971     __ br(Assembler::GE, LARGE_LOOP);
 9972 
 9973   __ bind(CHECK_16); // small 16-byte load pre-loop
 9974     __ cmp(len, (u1)16);
 9975     __ br(Assembler::LT, POST_LOOP16);
 9976 
 9977   __ bind(LOOP16); // small 16-byte load loop
 9978     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 9979     __ sub(len, len, 16);
 9980     __ orr(tmp2, tmp2, tmp3);
 9981     __ tst(tmp2, UPPER_BIT_MASK);
 9982     __ br(Assembler::NE, RET_ADJUST_16);
 9983     __ cmp(len, (u1)16);
 9984     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 9985 
 9986   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
          // Fewer than 16 bytes remain. With more than 8 left, test one whole word
          // first; len is only reduced after the test so a hit still counts these
          // 8 bytes as unchecked.
 9987     __ cmp(len, (u1)8);
 9988     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 9989     __ ldr(tmp3, Address(__ post(ary1, 8)));
 9990     __ tst(tmp3, UPPER_BIT_MASK);
 9991     __ br(Assembler::NE, RET_ADJUST);
 9992     __ sub(len, len, 8);
 9993 
 9994   __ bind(POST_LOOP16_LOAD_TAIL);
 9995     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
          // Load a full word and shift left by 64 - len * 8 bits so that only the
          // len lowest-order bytes of the load remain to be tested.
 9996     __ ldr(tmp1, Address(ary1));
 9997     __ mov(tmp2, 64);
 9998     __ sub(tmp4, tmp2, len, __ LSL, 3);
 9999     __ lslv(tmp1, tmp1, tmp4);
10000     __ tst(tmp1, UPPER_BIT_MASK);
10001     __ br(Assembler::NE, RET_ADJUST);
10002     // Fallthrough
10003 
          // Nothing found: result is returned as it was passed in.
10004   __ bind(RET_LEN);
10005     __ pop(spilled_regs, sp);
10006     __ leave();
10007     __ ret(lr);
10008 
10009     // difference result - len is the count of guaranteed to be
10010     // positive bytes
10011 
          // The three labels below fall through into each other. They add back
          // what the failing block had already subtracted from len: 64 in total
          // for LARGE_LOOP (48 here plus the 16 below), 16 for the 16-byte loads.
10012   __ bind(RET_ADJUST_LONG);
10013     __ add(len, len, (u1)(large_loop_size - 16));
10014   __ bind(RET_ADJUST_16);
10015     __ add(len, len, 16);
10016   __ bind(RET_ADJUST);
10017     __ pop(spilled_regs, sp);
10018     __ leave();
10019     __ sub(result, result, len);
10020     __ ret(lr);
10021 
10022     // record the stub entry and end plus the extra entry
10023     store_archive_data(stub_id, entry, __ pc(), &entries);
10024 
10025     return entry;
10026   }
10027 
10028   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
10029         bool usePrefetch, Label &NOT_EQUAL) {
          // Emits the general-purpose-register comparison loop used by
          // generate_large_array_equals. Each iteration issues four ldp loads per
          // array (8 words from a1 and 8 from a2), XORs matching words and
          // branches to NOT_EQUAL as soon as a pair differs.
          //
          // loopThreshold - the loop repeats while cnt1 - loopThreshold >= 0,
          //                 evaluated after cnt1 was reduced by 8 * wordSize
          // usePrefetch   - when true, a prfm at SoftwarePrefetchHintDistance is
          //                 emitted for both arrays at the top of each iteration
          // NOT_EQUAL     - label owned by the caller, taken on the first mismatch
          //
          // Loads are issued one step ahead of the compare that consumes them: two
          // words per array are loaded before the loop and the last loaded pair is
          // compared after it. Uses r11-r13 (tmp6-tmp8) in addition to r3-r5 and
          // the scratch registers. The local alias `result` is not referenced.
10030     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10031         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
10032         tmp7 = r12, tmp8 = r13;
10033     Label LOOP;
10034 
          // Pre-load the first two words of each array.
10035     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10036     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10037     __ bind(LOOP);
10038     if (usePrefetch) {
10039       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
10040       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
10041     }
          // Each group below loads the next pair while comparing the previous
          // one; the two register sets (tmp1-4 and tmp5-8) alternate.
10042     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
10043     __ eor(tmp1, tmp1, tmp2);
10044     __ eor(tmp3, tmp3, tmp4);
10045     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
10046     __ orr(tmp1, tmp1, tmp3);
10047     __ cbnz(tmp1, NOT_EQUAL);
10048     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10049     __ eor(tmp5, tmp5, tmp6);
10050     __ eor(tmp7, tmp7, tmp8);
10051     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10052     __ orr(tmp5, tmp5, tmp7);
10053     __ cbnz(tmp5, NOT_EQUAL);
10054     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
10055     __ eor(tmp1, tmp1, tmp2);
10056     __ eor(tmp3, tmp3, tmp4);
10057     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
10058     __ orr(tmp1, tmp1, tmp3);
10059     __ cbnz(tmp1, NOT_EQUAL);
10060     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10061     __ eor(tmp5, tmp5, tmp6);
10062     __ sub(cnt1, cnt1, 8 * wordSize);
10063     __ eor(tmp7, tmp7, tmp8);
10064     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10065     // tmp6 is not used. MacroAssembler::subs is used here (rather than
10066     // cmp) because subs allows an unlimited range of immediate operand.
10067     __ subs(tmp6, cnt1, loopThreshold);
10068     __ orr(tmp5, tmp5, tmp7);
          // cbnz leaves the condition flags alone, so the br(GE) below still
          // sees the result of the subs above.
10069     __ cbnz(tmp5, NOT_EQUAL);
10070     __ br(__ GE, LOOP);
10071     // post-loop
          // Compare the pair that was loaded last but not yet checked, and
          // account for its 2 * wordSize bytes in cnt1.
10072     __ eor(tmp1, tmp1, tmp2);
10073     __ eor(tmp3, tmp3, tmp4);
10074     __ orr(tmp1, tmp1, tmp3);
10075     __ sub(cnt1, cnt1, 2 * wordSize);
10076     __ cbnz(tmp1, NOT_EQUAL);
10077   }
10078 
10079   void generate_large_array_equals_loop_simd(int loopThreshold,
10080         bool usePrefetch, Label &NOT_EQUAL) {
          // Emits the vector-register comparison loop used by
          // generate_large_array_equals. Each iteration loads four vector
          // registers from a1 (v0-v3) and four from a2 (v4-v7), XORs them pairwise,
          // ORs the results into v0 and branches to NOT_EQUAL when any bit is set.
          //
          // loopThreshold - the loop repeats while cnt1 - loopThreshold >= 0,
          //                 evaluated after cnt1 was reduced by 8 * wordSize
          // usePrefetch   - when true, a prfm at SoftwarePrefetchHintDistance is
          //                 emitted for both arrays at the top of each iteration
          // NOT_EQUAL     - label owned by the caller, taken on the first mismatch
          //
          // The local alias `result` is not referenced in this helper.
10081     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10082         tmp2 = rscratch2;
10083     Label LOOP;
10084 
10085     __ bind(LOOP);
10086     if (usePrefetch) {
10087       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
10088       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
10089     }
10090     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
10091     __ sub(cnt1, cnt1, 8 * wordSize);
10092     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
          // Only the flags of this subs matter; tmp1 is overwritten by the umov
          // below, and the flags are consumed by the final br(GE).
10093     __ subs(tmp1, cnt1, loopThreshold);
10094     __ eor(v0, __ T16B, v0, v4);
10095     __ eor(v1, __ T16B, v1, v5);
10096     __ eor(v2, __ T16B, v2, v6);
10097     __ eor(v3, __ T16B, v3, v7);
10098     __ orr(v0, __ T16B, v0, v1);
10099     __ orr(v1, __ T16B, v2, v3);
10100     __ orr(v0, __ T16B, v0, v1);
          // Move both 64-bit halves of the combined difference to general
          // registers so it can be tested with a single cbnz.
10101     __ umov(tmp1, v0, __ D, 0);
10102     __ umov(tmp2, v0, __ D, 1);
10103     __ orr(tmp1, tmp1, tmp2);
10104     __ cbnz(tmp1, NOT_EQUAL);
10105     __ br(__ GE, LOOP);
10106   }
10107 
10108   // a1 = r1 - array1 address
10109   // a2 = r2 - array2 address
10110   // result = r0 - return value. Already contains "false"
10111   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
10112   // r3-r5 are reserved temporary registers
10113   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
        //
        // Generates the large_array_equals stub (or returns the archived copy) and
        // returns its entry address. The stub sets result to true only when every
        // compared word matches; on a mismatch result is left as it was on entry.
        // The main loop is the SIMD or the non-SIMD helper, selected by
        // UseSIMDForArrayEquals, optionally preceded by a prefetching variant of
        // the same loop; the remainder is handled word by word.
10114   address generate_large_array_equals() {
10115     StubId stub_id = StubId::stubgen_large_array_equals_id;
10116     int entry_count = StubInfo::entry_count(stub_id);
10117     assert(entry_count == 1, "sanity check");
10118     address start = load_archive_data(stub_id);
10119     if (start != nullptr) {
10120       return start;
10121     }
10122     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10123         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
10124         tmp7 = r12, tmp8 = r13;
10125     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
10126         SMALL_LOOP, POST_LOOP;
          // The non-SIMD loop loads 2 * wordSize bytes per array ahead of its first
          // iteration, so its threshold is raised by 16.
10127     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
10128     // calculate if at least 32 prefetched bytes are used
10129     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
10130     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
          // r11-r13: used only by the non-SIMD loop, pushed/popped around it.
10131     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
10132     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
10133         tmp5, tmp6, tmp7, tmp8);
10134 
10135     __ align(CodeEntryAlignment);
10136 
10137     StubCodeMark mark(this, stub_id);
10138 
10139     address entry = __ pc();
10140     __ enter();
10141     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
10142     // also advance pointers to use post-increment instead of pre-increment
10143     __ add(a1, a1, wordSize);
10144     __ add(a2, a2, wordSize);
10145     if (AvoidUnalignedAccesses) {
10146       // both implementations (SIMD/nonSIMD) are using relatively large load
10147       // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
10148       // on some CPUs in case of address is not at least 16-byte aligned.
10149       // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
10150       // load if needed at least for 1st address and make it 16-byte aligned.
10151       Label ALIGNED16;
            // Bit 3 of a1 clear means a1 is already 16-byte aligned (given the
            // 8-byte alignment stated above).
10152       __ tbz(a1, 3, ALIGNED16);
10153       __ ldr(tmp1, Address(__ post(a1, wordSize)));
10154       __ ldr(tmp2, Address(__ post(a2, wordSize)));
10155       __ sub(cnt1, cnt1, wordSize);
10156       __ eor(tmp1, tmp1, tmp2);
            // Nothing has been pushed yet, hence the NO_POP exit.
10157       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
10158       __ bind(ALIGNED16);
10159     }
10160     if (UseSIMDForArrayEquals) {
10161       if (SoftwarePrefetchHintDistance >= 0) {
              // Use the prefetching loop only while enough data remains; after
              // it, go to TAIL directly if too little is left for the plain loop.
10162         __ subs(tmp1, cnt1, prefetchLoopThreshold);
10163         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
10164         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
10165             /* prfm = */ true, NOT_EQUAL);
10166         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
10167         __ br(__ LT, TAIL);
10168       }
10169       __ bind(NO_PREFETCH_LARGE_LOOP);
10170       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
10171           /* prfm = */ false, NOT_EQUAL);
10172     } else {
10173       __ push(spilled_regs, sp);
10174       if (SoftwarePrefetchHintDistance >= 0) {
10175         __ subs(tmp1, cnt1, prefetchLoopThreshold);
10176         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
10177         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
10178             /* prfm = */ true, NOT_EQUAL);
10179         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
10180         __ br(__ LT, TAIL);
10181       }
10182       __ bind(NO_PREFETCH_LARGE_LOOP);
10183       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
10184           /* prfm = */ false, NOT_EQUAL);
10185     }
10186     __ bind(TAIL);
10187       __ cbz(cnt1, EQUAL);
10188       __ subs(cnt1, cnt1, wordSize);
10189       __ br(__ LE, POST_LOOP);
10190     __ bind(SMALL_LOOP);
            // One word per iteration; cbnz does not touch the flags set by subs.
10191       __ ldr(tmp1, Address(__ post(a1, wordSize)));
10192       __ ldr(tmp2, Address(__ post(a2, wordSize)));
10193       __ subs(cnt1, cnt1, wordSize);
10194       __ eor(tmp1, tmp1, tmp2);
10195       __ cbnz(tmp1, NOT_EQUAL);
10196       __ br(__ GT, SMALL_LOOP);
10197     __ bind(POST_LOOP);
            // cnt1 is <= 0 here, so this last full-word load is placed to end at
            // the end of the data and may overlap bytes that were already compared.
10198       __ ldr(tmp1, Address(a1, cnt1));
10199       __ ldr(tmp2, Address(a2, cnt1));
10200       __ eor(tmp1, tmp1, tmp2);
10201       __ cbnz(tmp1, NOT_EQUAL);
10202     __ bind(EQUAL);
10203       __ mov(result, true);
          // EQUAL falls through: the exit sequence below is shared by both outcomes.
10204     __ bind(NOT_EQUAL);
10205       if (!UseSIMDForArrayEquals) {
10206         __ pop(spilled_regs, sp);
10207       }
10208     __ bind(NOT_EQUAL_NO_POP);
10209     __ leave();
10210     __ ret(lr);
10211 
10212     // record the stub entry and end
10213     store_archive_data(stub_id, entry, __ pc());
10214 
10215     return entry;
10216   }
10217 
10218   // result = r0 - return value. Contains initial hashcode value on entry.
10219   // ary = r1 - array address
10220   // cnt = r2 - elements count
10221   // Clobbers: v0-v13, rscratch1, rscratch2
        //
        // Generates the vectorized hashcode stub for arrays of the given element
        // type (or returns the archived copy) and returns its entry address.
        // Supported element types are T_BOOLEAN, T_BYTE, T_CHAR, T_SHORT and T_INT.
        // The emitted code has three parts: a small loop consuming one vector per
        // iteration, a large loop consuming four vectors per iteration, and an
        // unrolled scalar tail for the last cnt % vf elements, all computing the
        // base-31 polynomial hash.
10222   address generate_large_arrays_hashcode(BasicType eltype) {
10223     StubId stub_id;
10224     switch (eltype) {
10225     case T_BOOLEAN:
10226       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
10227       break;
10228     case T_BYTE:
10229       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
10230       break;
10231     case T_CHAR:
10232       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
10233       break;
10234     case T_SHORT:
10235       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
10236       break;
10237     case T_INT:
10238       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
10239       break;
10240     default:
10241       stub_id = StubId::NO_STUBID;
10242       ShouldNotReachHere();
10243     };
10244     int entry_count = StubInfo::entry_count(stub_id);
10245     assert(entry_count == 1, "sanity check");
10246     address start = load_archive_data(stub_id);
10247     if (start != nullptr) {
10248       return start;
10249     }
10250     const Register result = r0, ary = r1, cnt = r2;
          // vdata3..vdata0 map to v0..v3, so the four-register ld1 in the large
          // loop fills vdata3 first, i.e. with the lowest-addressed elements.
10251     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
10252     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
10253     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
          // Per-iteration multipliers (powers of 31) for the two loops.
10254     const FloatRegister vpowm = v13;
10255 
          // NOTE(review): macro defined outside this chunk; its expansion is not
          // visible here.
10256     ARRAYS_HASHCODE_REGISTERS;
10257 
10258     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
10259 
10260     unsigned int vf; // vectorization factor
          // True for sub-word types: a vector of 8 elements is then accumulated as
          // two halves of 4, with a multiply between them.
10261     bool multiply_by_halves;
10262     Assembler::SIMD_Arrangement load_arrangement;
10263     switch (eltype) {
10264     case T_BOOLEAN:
10265     case T_BYTE:
10266       load_arrangement = Assembler::T8B;
10267       multiply_by_halves = true;
10268       vf = 8;
10269       break;
10270     case T_CHAR:
10271     case T_SHORT:
10272       load_arrangement = Assembler::T8H;
10273       multiply_by_halves = true;
10274       vf = 8;
10275       break;
10276     case T_INT:
10277       load_arrangement = Assembler::T4S;
10278       multiply_by_halves = false;
10279       vf = 4;
10280       break;
10281     default:
10282       ShouldNotReachHere();
10283     }
10284 
10285     // Unroll factor
10286     const unsigned uf = 4;
10287 
10288     // Effective vectorization factor
10289     const unsigned evf = vf * uf;
10290 
10291     __ align(CodeEntryAlignment);
10292 
10293     StubCodeMark mark(this, stub_id);
10294 
10295     address entry = __ pc();
10296     __ enter();
10297 
10298     // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
10299     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
10300     // value shouldn't change throughout both loops.
10301     __ movw(rscratch1, intpow(31U, 3));
10302     __ mov(vpow, Assembler::S, 0, rscratch1);
10303     __ movw(rscratch1, intpow(31U, 2));
10304     __ mov(vpow, Assembler::S, 1, rscratch1);
10305     __ movw(rscratch1, intpow(31U, 1));
10306     __ mov(vpow, Assembler::S, 2, rscratch1);
10307     __ movw(rscratch1, intpow(31U, 0));
10308     __ mov(vpow, Assembler::S, 3, rscratch1);
10309 
          // Seed the accumulator: all lanes zero except lane 3, which carries the
          // incoming hash value.
10310     __ mov(vmul0, Assembler::T16B, 0);
10311     __ mov(vmul0, Assembler::S, 3, result);
10312 
          // rscratch2 = cnt masked with (uf - 1) * vf: the number of elements the
          // small loop handles, in steps of vf. Zero means go straight to the
          // large loop.
10313     __ andr(rscratch2, cnt, (uf - 1) * vf);
10314     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
10315 
10316     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
10317     __ mov(vpowm, Assembler::S, 0, rscratch1);
10318 
10319     // SMALL LOOP
10320     __ bind(SMALL_LOOP);
10321 
10322     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
10323     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
          // Sets the flags tested by the br(HI) at the bottom of the loop.
10324     __ subsw(rscratch2, rscratch2, vf);
10325 
10326     if (load_arrangement == Assembler::T8B) {
10327       // Extend 8B to 8H to be able to use vector multiply
10328       // instructions
10329       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
10330       if (is_signed_subword_type(eltype)) {
10331         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10332       } else {
10333         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10334       }
10335     }
10336 
          // Add the (lower) four elements into the four 32-bit accumulator lanes.
10337     switch (load_arrangement) {
10338     case Assembler::T4S:
10339       __ addv(vmul0, load_arrangement, vmul0, vdata0);
10340       break;
10341     case Assembler::T8B:
10342     case Assembler::T8H:
10343       assert(is_subword_type(eltype), "subword type expected");
10344       if (is_signed_subword_type(eltype)) {
10345         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10346       } else {
10347         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10348       }
10349       break;
10350     default:
10351       __ should_not_reach_here();
10352     }
10353 
10354     // Process the upper half of a vector
10355     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
10356       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
10357       if (is_signed_subword_type(eltype)) {
10358         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10359       } else {
10360         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10361       }
10362     }
10363 
10364     __ br(Assembler::HI, SMALL_LOOP);
10365 
10366     // SMALL LOOP'S EPILOGUE
          // If at least evf elements are in cnt, continue with the large loop and
          // keep vmul0 as a vector; otherwise reduce it to a scalar now.
10367     __ lsr(rscratch2, cnt, exact_log2(evf));
10368     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
10369 
          // Weight the lanes by <31^3, ..., 31^0> and sum them into result.
10370     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
10371     __ addv(vmul0, Assembler::T4S, vmul0);
10372     __ umov(result, vmul0, Assembler::S, 0);
10373 
10374     // TAIL
10375     __ bind(TAIL);
10376 
10377     // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
10378     // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
10379     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
10380     __ andr(rscratch2, cnt, vf - 1);
          // Entered from the large loop's exit with rscratch2 already holding
          // cnt % vf (non-zero).
10381     __ bind(TAIL_SHORTCUT);
10382     __ adr(rscratch1, BR_BASE);
10383     // For Cortex-A53 offset is 4 because 2 nops are generated.
10384     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
          // rscratch2 is reused as the multiplier 31 for the maddw steps below.
10385     __ movw(rscratch2, 0x1f);
10386     __ br(rscratch1);
10387 
          // vf - 1 unrolled steps of result = result * 31 + element. The computed
          // branch above lands so that exactly cnt % vf of them run before BR_BASE.
10388     for (size_t i = 0; i < vf - 1; ++i) {
10389       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
10390                                    eltype);
10391       __ maddw(result, result, rscratch2, rscratch1);
10392       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
10393       // Generate 2nd nop to have 4 instructions per iteration.
10394       if (VM_Version::supports_a53mac()) {
10395         __ nop();
10396       }
10397     }
10398     __ bind(BR_BASE);
10399 
10400     __ leave();
10401     __ ret(lr);
10402 
10403     // LARGE LOOP
10404     __ bind(LARGE_LOOP_PREHEADER);
10405 
          // Iteration count: evf elements are consumed per large-loop iteration.
10406     __ lsr(rscratch2, cnt, exact_log2(evf));
10407 
10408     if (multiply_by_halves) {
10409       // 31^4 - multiplier between lower and upper parts of a register
10410       __ movw(rscratch1, intpow(31U, vf / 2));
10411       __ mov(vpowm, Assembler::S, 1, rscratch1);
10412       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
10413       __ movw(rscratch1, intpow(31U, evf - vf / 2));
10414       __ mov(vpowm, Assembler::S, 0, rscratch1);
10415     } else {
10416       // 31^16
10417       __ movw(rscratch1, intpow(31U, evf));
10418       __ mov(vpowm, Assembler::S, 0, rscratch1);
10419     }
10420 
          // vmul0 keeps whatever was accumulated so far; the other three
          // accumulators start from zero.
10421     __ mov(vmul3, Assembler::T16B, 0);
10422     __ mov(vmul2, Assembler::T16B, 0);
10423     __ mov(vmul1, Assembler::T16B, 0);
10424 
10425     __ bind(LARGE_LOOP);
10426 
10427     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
10428     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
10429     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
10430     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
10431 
10432     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
10433            Address(__ post(ary, evf * type2aelembytes(eltype))));
10434 
10435     if (load_arrangement == Assembler::T8B) {
10436       // Extend 8B to 8H to be able to use vector multiply
10437       // instructions
10438       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
10439       if (is_signed_subword_type(eltype)) {
10440         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
10441         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
10442         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
10443         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10444       } else {
10445         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
10446         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
10447         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
10448         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10449       }
10450     }
10451 
10452     switch (load_arrangement) {
10453     case Assembler::T4S:
10454       __ addv(vmul3, load_arrangement, vmul3, vdata3);
10455       __ addv(vmul2, load_arrangement, vmul2, vdata2);
10456       __ addv(vmul1, load_arrangement, vmul1, vdata1);
10457       __ addv(vmul0, load_arrangement, vmul0, vdata0);
10458       break;
10459     case Assembler::T8B:
10460     case Assembler::T8H:
10461       assert(is_subword_type(eltype), "subword type expected");
10462       if (is_signed_subword_type(eltype)) {
10463         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
10464         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
10465         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
10466         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10467       } else {
10468         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
10469         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
10470         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
10471         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10472       }
10473       break;
10474     default:
10475       __ should_not_reach_here();
10476     }
10477 
10478     // Process the upper half of a vector
10479     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
            // Lane 1 of vpowm holds the multiplier between the two halves.
10480       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
10481       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
10482       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
10483       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
10484       if (is_signed_subword_type(eltype)) {
10485         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
10486         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
10487         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
10488         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10489       } else {
10490         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
10491         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
10492         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
10493         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10494       }
10495     }
10496 
10497     __ subsw(rscratch2, rscratch2, 1);
10498     __ br(Assembler::HI, LARGE_LOOP);
10499 
          // Reduce each accumulator to a scalar h3..h0 (lanes weighted by vpow,
          // then summed) and combine them:
          //   result = ((h3 * 31^vf + h2) * 31^vf + h1) * 31^vf + h0
10500     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
10501     __ addv(vmul3, Assembler::T4S, vmul3);
10502     __ umov(result, vmul3, Assembler::S, 0);
10503 
10504     __ mov(rscratch2, intpow(31U, vf));
10505 
10506     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
10507     __ addv(vmul2, Assembler::T4S, vmul2);
10508     __ umov(rscratch1, vmul2, Assembler::S, 0);
10509     __ maddw(result, result, rscratch2, rscratch1);
10510 
10511     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
10512     __ addv(vmul1, Assembler::T4S, vmul1);
10513     __ umov(rscratch1, vmul1, Assembler::S, 0);
10514     __ maddw(result, result, rscratch2, rscratch1);
10515 
10516     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
10517     __ addv(vmul0, Assembler::T4S, vmul0);
10518     __ umov(rscratch1, vmul0, Assembler::S, 0);
10519     __ maddw(result, result, rscratch2, rscratch1);
10520 
          // Remaining cnt % vf elements, if any, are handled by the scalar tail.
10521     __ andr(rscratch2, cnt, vf - 1);
10522     __ cbnz(rscratch2, TAIL_SHORTCUT);
10523 
10524     __ leave();
10525     __ ret(lr);
10526 
10527     // record the stub entry and end
10528     store_archive_data(stub_id, entry, __ pc());
10529 
10530     return entry;
10531   }
10532 
10533   address generate_dsin_dcos(bool isCos) {
          // Generates the dcos stub (isCos == true) or the dsin stub
          // (isCos == false), or returns the archived copy when one is available.
          // Code emission is delegated to the assembler's generate_dsin_dcos, which
          // receives the addresses of the constant tables it needs. Returns the
          // stub's entry address.
10534     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
10535     int entry_count = StubInfo::entry_count(stub_id);
10536     assert(entry_count == 1, "sanity check");
10537     address start = load_archive_data(stub_id);
10538     if (start != nullptr) {
10539       return start;
10540     }
10541     __ align(CodeEntryAlignment);
10542     StubCodeMark mark(this, stub_id);
          // start is reused: from here on it is the entry of the freshly
          // generated code.
10543     start = __ pc();
10544     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
10545         (address)StubRoutines::aarch64::_two_over_pi,
10546         (address)StubRoutines::aarch64::_pio2,
10547         (address)StubRoutines::aarch64::_dsin_coef,
10548         (address)StubRoutines::aarch64::_dcos_coef);
10549 
10550     // record the stub entry and end
10551     store_archive_data(stub_id, start, __ pc());
10552 
10553     return start;
10554   }
10555 
10556   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
        //
        // Emission helper for generate_compare_long_string_different_encoding.
        // Fixed registers: tmp2 (r11) points at the Latin1 data and is advanced by
        // 16 bytes; cnt1 (r2) points at the UTF-16 data and is advanced by four
        // 8-byte loads. vtmpZ (v0) is interleaved with the Latin1 bytes to widen
        // them; the caller zeroes it beforehand.
        //
        // tmpL   - register receiving each widened group of four Latin1 characters
        // tmpU   - register receiving every other UTF-16 word
        // DIFF1  - taken when a mismatch is found against the word held in tmpU
        // DIFF2  - taken when a mismatch is found against the word held in tmp3
        //
        // tmp3 (r12) must hold the next UTF-16 word on entry, and on exit it holds
        // the word to be compared first by the following invocation.
10557   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
10558       Label &DIFF2) {
10559     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
10560     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
10561 
10562     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
10563     __ ldr(tmpU, Address(__ post(cnt1, 8)));
          // Interleave the low 8 Latin1 bytes with vtmpZ.
10564     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
10565     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
10566 
          // Characters 0..3: compare against the word already held in tmp3.
10567     __ fmovd(tmpL, vtmp3);
10568     __ eor(rscratch2, tmp3, tmpL);
10569     __ cbnz(rscratch2, DIFF2);
10570 
          // Characters 4..7: compare against tmpU while the next word is loaded.
10571     __ ldr(tmp3, Address(__ post(cnt1, 8)));
10572     __ umov(tmpL, vtmp3, __ D, 1);
10573     __ eor(rscratch2, tmpU, tmpL);
10574     __ cbnz(rscratch2, DIFF1);
10575 
          // Interleave the high 8 Latin1 bytes and repeat for characters 8..15.
10576     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
10577     __ ldr(tmpU, Address(__ post(cnt1, 8)));
10578     __ fmovd(tmpL, vtmp);
10579     __ eor(rscratch2, tmp3, tmpL);
10580     __ cbnz(rscratch2, DIFF2);
10581 
          // This load is not compared here; it is left in tmp3 for the caller.
10582     __ ldr(tmp3, Address(__ post(cnt1, 8)));
10583     __ umov(tmpL, vtmp, __ D, 1);
10584     __ eor(rscratch2, tmpU, tmpL);
10585     __ cbnz(rscratch2, DIFF1);
10586   }
10587 
10588   // r0  = result
10589   // r1  = str1
10590   // r2  = cnt1
10591   // r3  = str2
10592   // r4  = cnt2
10593   // r10 = tmp1
10594   // r11 = tmp2
10595   address generate_compare_long_string_different_encoding(bool isLU) {
10596     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
10597     int entry_count = StubInfo::entry_count(stub_id);
10598     assert(entry_count == 1, "sanity check");
10599     address start = load_archive_data(stub_id);
10600     if (start != nullptr) {
10601       return start;
10602     }
10603     __ align(CodeEntryAlignment);
10604     StubCodeMark mark(this, stub_id);
10605     address entry = __ pc();
10606     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
10607         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
10608         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
10609     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
10610         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
10611     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
10612     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
10613 
10614     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
10615 
10616     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
10617     // cnt2 == amount of characters left to compare
10618     // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
10619     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
10620     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
10621     __ add(str2, str2, isLU ? wordSize : wordSize/2);
10622     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
10623     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
10624     __ eor(rscratch2, tmp1, tmp2);
10625     __ mov(rscratch1, tmp2);
10626     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
10627     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
10628              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
10629     __ push(spilled_regs, sp);
10630     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
10631     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
10632 
10633     __ ldr(tmp3, Address(__ post(cnt1, 8)));
10634 
10635     if (SoftwarePrefetchHintDistance >= 0) {
10636       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
10637       __ br(__ LT, NO_PREFETCH);
10638       __ bind(LARGE_LOOP_PREFETCH);
10639         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
10640         __ mov(tmp4, 2);
10641         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
10642         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
10643           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10644           __ subs(tmp4, tmp4, 1);
10645           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
10646           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
10647           __ mov(tmp4, 2);
10648         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
10649           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10650           __ subs(tmp4, tmp4, 1);
10651           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
10652           __ sub(cnt2, cnt2, 64);
10653           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
10654           __ br(__ GE, LARGE_LOOP_PREFETCH);
10655     }
10656     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
10657     __ bind(NO_PREFETCH);
10658     __ subs(cnt2, cnt2, 16);
10659     __ br(__ LT, TAIL);
10660     __ align(OptoLoopAlignment);
10661     __ bind(SMALL_LOOP); // smaller loop
10662       __ subs(cnt2, cnt2, 16);
10663       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10664       __ br(__ GE, SMALL_LOOP);
10665       __ cmn(cnt2, (u1)16);
10666       __ br(__ EQ, LOAD_LAST);
10667     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
10668       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
10669       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
10670       __ ldr(tmp3, Address(cnt1, -8));
10671       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
10672       __ b(LOAD_LAST);
10673     __ bind(DIFF2);
10674       __ mov(tmpU, tmp3);
10675     __ bind(DIFF1);
10676       __ pop(spilled_regs, sp);
10677       __ b(CALCULATE_DIFFERENCE);
10678     __ bind(LOAD_LAST);
10679       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
10680       // No need to load it again
10681       __ mov(tmpU, tmp3);
10682       __ pop(spilled_regs, sp);
10683 
10684       // tmp2 points to the address of the last 4 Latin1 characters right now
10685       __ ldrs(vtmp, Address(tmp2));
10686       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
10687       __ fmovd(tmpL, vtmp);
10688 
10689       __ eor(rscratch2, tmpU, tmpL);
10690       __ cbz(rscratch2, DONE);
10691 
10692     // Find the first different characters in the longwords and
10693     // compute their difference.
10694     __ bind(CALCULATE_DIFFERENCE);
10695       __ rev(rscratch2, rscratch2);
10696       __ clz(rscratch2, rscratch2);
10697       __ andr(rscratch2, rscratch2, -16);
10698       __ lsrv(tmp1, tmp1, rscratch2);
10699       __ uxthw(tmp1, tmp1);
10700       __ lsrv(rscratch1, rscratch1, rscratch2);
10701       __ uxthw(rscratch1, rscratch1);
10702       __ subw(result, tmp1, rscratch1);
10703     __ bind(DONE);
10704       __ ret(lr);
10705 
10706       // record the stub entry and end
10707       store_archive_data(stub_id, entry, __ pc());
10708 
10709       return entry;
10710   }
10711 
10712   // r0 = input (float16)
10713   // v0 = result (float)
10714   // v1 = temporary float register
10715   address generate_float16ToFloat() {
10716     StubId stub_id = StubId::stubgen_hf2f_id;
10717     int entry_count = StubInfo::entry_count(stub_id);
10718     assert(entry_count == 1, "sanity check");
10719     address start = load_archive_data(stub_id);
10720     if (start != nullptr) {
10721       return start;
10722     }
10723     __ align(CodeEntryAlignment);
10724     StubCodeMark mark(this, stub_id);
10725     address entry = __ pc();
10726     BLOCK_COMMENT("Entry:");
10727     __ flt16_to_flt(v0, r0, v1);
10728     __ ret(lr);
10729 
10730     // record the stub entry and end
10731     store_archive_data(stub_id, entry, __ pc());
10732 
10733     return entry;
10734   }
10735 
10736   // v0 = input (float)
10737   // r0 = result (float16)
10738   // v1 = temporary float register
10739   address generate_floatToFloat16() {
10740     StubId stub_id = StubId::stubgen_f2hf_id;
10741     int entry_count = StubInfo::entry_count(stub_id);
10742     assert(entry_count == 1, "sanity check");
10743     address start = load_archive_data(stub_id);
10744     if (start != nullptr) {
10745       return start;
10746     }
10747     __ align(CodeEntryAlignment);
10748     StubCodeMark mark(this, stub_id);
10749     address entry = __ pc();
10750     BLOCK_COMMENT("Entry:");
10751     __ flt_to_flt16(r0, v0, v1);
10752     __ ret(lr);
10753 
10754     // record the stub entry and end
10755     store_archive_data(stub_id, entry, __ pc());
10756 
10757     return entry;
10758   }
10759 
10760   address generate_method_entry_barrier() {
10761     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
10762     int entry_count = StubInfo::entry_count(stub_id);
10763     assert(entry_count == 1, "sanity check");
10764     address start = load_archive_data(stub_id);
10765     if (start != nullptr) {
10766       return start;
10767     }
10768     __ align(CodeEntryAlignment);
10769     StubCodeMark mark(this, stub_id);
10770 
10771     Label deoptimize_label;
10772 
10773     start = __ pc();
10774 
10775     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
10776 
10777     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
10778       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
10779       // We can get here despite the nmethod being good, if we have not
10780       // yet applied our cross modification fence (or data fence).
10781       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
10782       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
10783       __ ldrw(rscratch2, rscratch2);
10784       __ strw(rscratch2, thread_epoch_addr);
10785       __ isb();
10786       __ membar(__ LoadLoad);
10787     }
10788 
10789     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
10790 
10791     __ enter();
10792     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
10793 
10794     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
10795 
10796     __ push_call_clobbered_registers();
10797 
10798     __ mov(c_rarg0, rscratch2);
10799     __ call_VM_leaf
10800          (CAST_FROM_FN_PTR
10801           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
10802 
10803     __ reset_last_Java_frame(true);
10804 
10805     __ mov(rscratch1, r0);
10806 
10807     __ pop_call_clobbered_registers();
10808 
10809     __ cbnz(rscratch1, deoptimize_label);
10810 
10811     __ leave();
10812     __ ret(lr);
10813 
10814     __ BIND(deoptimize_label);
10815 
10816     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
10817     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
10818 
10819     __ mov(sp, rscratch1);
10820     __ br(rscratch2);
10821 
10822     // record the stub entry and end
10823     store_archive_data(stub_id, start, __ pc());
10824 
10825     return start;
10826   }
10827 
10828   // r0  = result
10829   // r1  = str1
10830   // r2  = cnt1
10831   // r3  = str2
10832   // r4  = cnt2
10833   // r10 = tmp1
10834   // r11 = tmp2
10835   address generate_compare_long_string_same_encoding(bool isLL) {
10836     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
10837     int entry_count = StubInfo::entry_count(stub_id);
10838     assert(entry_count == 1, "sanity check");
10839     address start = load_archive_data(stub_id);
10840     if (start != nullptr) {
10841       return start;
10842     }
10843     __ align(CodeEntryAlignment);
10844     StubCodeMark mark(this, stub_id);
10845     address entry = __ pc();
10846     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
10847         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
10848 
10849     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
10850 
10851     // exit from large loop when less than 64 bytes left to read or we're about
10852     // to prefetch memory behind array border
10853     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
10854 
10855     // before jumping to stub, pre-load 8 bytes already, so do comparison directly
10856     __ eor(rscratch2, tmp1, tmp2);
10857     __ cbnz(rscratch2, CAL_DIFFERENCE);
10858 
10859     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
10860     // update pointers, because of previous read
10861     __ add(str1, str1, wordSize);
10862     __ add(str2, str2, wordSize);
10863     if (SoftwarePrefetchHintDistance >= 0) {
10864       __ align(OptoLoopAlignment);
10865       __ bind(LARGE_LOOP_PREFETCH);
10866         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
10867         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
10868 
10869         for (int i = 0; i < 4; i++) {
10870           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
10871           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
10872           __ cmp(tmp1, tmp2);
10873           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10874           __ br(Assembler::NE, DIFF);
10875         }
10876         __ sub(cnt2, cnt2, isLL ? 64 : 32);
10877         __ add(str1, str1, 64);
10878         __ add(str2, str2, 64);
10879         __ subs(rscratch2, cnt2, largeLoopExitCondition);
10880         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
10881         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
10882     }
10883 
10884     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
10885     __ br(Assembler::LE, LESS16);
10886     __ align(OptoLoopAlignment);
10887     __ bind(LOOP_COMPARE16);
10888       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
10889       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
10890       __ cmp(tmp1, tmp2);
10891       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10892       __ br(Assembler::NE, DIFF);
10893       __ sub(cnt2, cnt2, isLL ? 16 : 8);
10894       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
10895       __ br(Assembler::LT, LESS16);
10896 
10897       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
10898       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
10899       __ cmp(tmp1, tmp2);
10900       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10901       __ br(Assembler::NE, DIFF);
10902       __ sub(cnt2, cnt2, isLL ? 16 : 8);
10903       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
10904       __ br(Assembler::GE, LOOP_COMPARE16);
10905       __ cbz(cnt2, LENGTH_DIFF);
10906 
10907     __ bind(LESS16);
10908       // each 8 compare
10909       __ subs(cnt2, cnt2, isLL ? 8 : 4);
10910       __ br(Assembler::LE, LESS8);
10911       __ ldr(tmp1, Address(__ post(str1, 8)));
10912       __ ldr(tmp2, Address(__ post(str2, 8)));
10913       __ eor(rscratch2, tmp1, tmp2);
10914       __ cbnz(rscratch2, CAL_DIFFERENCE);
10915       __ sub(cnt2, cnt2, isLL ? 8 : 4);
10916 
10917     __ bind(LESS8); // directly load last 8 bytes
10918       if (!isLL) {
10919         __ add(cnt2, cnt2, cnt2);
10920       }
10921       __ ldr(tmp1, Address(str1, cnt2));
10922       __ ldr(tmp2, Address(str2, cnt2));
10923       __ eor(rscratch2, tmp1, tmp2);
10924       __ cbz(rscratch2, LENGTH_DIFF);
10925       __ b(CAL_DIFFERENCE);
10926 
10927     __ bind(DIFF);
10928       __ cmp(tmp1, tmp2);
10929       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
10930       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
10931       // reuse rscratch2 register for the result of eor instruction
10932       __ eor(rscratch2, tmp1, tmp2);
10933 
10934     __ bind(CAL_DIFFERENCE);
10935       __ rev(rscratch2, rscratch2);
10936       __ clz(rscratch2, rscratch2);
10937       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
10938       __ lsrv(tmp1, tmp1, rscratch2);
10939       __ lsrv(tmp2, tmp2, rscratch2);
10940       if (isLL) {
10941         __ uxtbw(tmp1, tmp1);
10942         __ uxtbw(tmp2, tmp2);
10943       } else {
10944         __ uxthw(tmp1, tmp1);
10945         __ uxthw(tmp2, tmp2);
10946       }
10947       __ subw(result, tmp1, tmp2);
10948 
10949     __ bind(LENGTH_DIFF);
10950       __ ret(lr);
10951 
10952     // record the stub entry and end
10953     store_archive_data(stub_id, entry, __ pc());
10954 
10955     return entry;
10956   }
10957 
10958   enum string_compare_mode {
10959     LL,
10960     LU,
10961     UL,
10962     UU,
10963   };
10964 
10965   // The following registers are declared in aarch64.ad
10966   // r0  = result
10967   // r1  = str1
10968   // r2  = cnt1
10969   // r3  = str2
10970   // r4  = cnt2
10971   // r10 = tmp1
10972   // r11 = tmp2
10973   // z0  = ztmp1
10974   // z1  = ztmp2
10975   // p0  = pgtmp1
10976   // p1  = pgtmp2
10977   address generate_compare_long_string_sve(string_compare_mode mode) {
10978     StubId stub_id;
10979     switch (mode) {
10980       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
10981       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
10982       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
10983       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
10984       default: ShouldNotReachHere();
10985     }
10986     int entry_count = StubInfo::entry_count(stub_id);
10987     assert(entry_count == 1, "sanity check");
10988     address start = load_archive_data(stub_id);
10989     if (start != nullptr) {
10990       return start;
10991     }
10992     __ align(CodeEntryAlignment);
10993     StubCodeMark mark(this, stub_id);
10994     address entry = __ pc();
10995     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
10996              tmp1 = r10, tmp2 = r11;
10997 
10998     Label LOOP, DONE, MISMATCH;
10999     Register vec_len = tmp1;
11000     Register idx = tmp2;
11001     // The minimum of the string lengths has been stored in cnt2.
11002     Register cnt = cnt2;
11003     FloatRegister ztmp1 = z0, ztmp2 = z1;
11004     PRegister pgtmp1 = p0, pgtmp2 = p1;
11005 
11006 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
11007     switch (mode) {                                                            \
11008       case LL:                                                                 \
11009         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
11010         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
11011         break;                                                                 \
11012       case LU:                                                                 \
11013         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
11014         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
11015         break;                                                                 \
11016       case UL:                                                                 \
11017         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
11018         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
11019         break;                                                                 \
11020       case UU:                                                                 \
11021         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
11022         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
11023         break;                                                                 \
11024       default:                                                                 \
11025         ShouldNotReachHere();                                                  \
11026     }
11027 
11028     __ mov(idx, 0);
11029     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
11030 
11031     if (mode == LL) {
11032       __ sve_cntb(vec_len);
11033     } else {
11034       __ sve_cnth(vec_len);
11035     }
11036 
11037     __ sub(rscratch1, cnt, vec_len);
11038 
11039     __ bind(LOOP);
11040 
11041       // main loop
11042       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
11043       __ add(idx, idx, vec_len);
11044       // Compare strings.
11045       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
11046       __ br(__ NE, MISMATCH);
11047       __ cmp(idx, rscratch1);
11048       __ br(__ LT, LOOP);
11049 
11050     // post loop, last iteration
11051     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
11052 
11053     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
11054     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
11055     __ br(__ EQ, DONE);
11056 
11057     __ bind(MISMATCH);
11058 
11059     // Crop the vector to find its location.
11060     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
11061     // Extract the first different characters of each string.
11062     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
11063     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
11064 
11065     // Compute the difference of the first different characters.
11066     __ sub(result, rscratch1, rscratch2);
11067 
11068     __ bind(DONE);
11069     __ ret(lr);
11070 #undef LOAD_PAIR
11071 
11072     // record the stub entry and end
11073     store_archive_data(stub_id, entry, __ pc());
11074 
11075     return entry;
11076   }
11077 
11078   void generate_compare_long_strings() {
11079     if (UseSVE == 0) {
11080       StubRoutines::aarch64::_compare_long_string_LL
11081           = generate_compare_long_string_same_encoding(true);
11082       StubRoutines::aarch64::_compare_long_string_UU
11083           = generate_compare_long_string_same_encoding(false);
11084       StubRoutines::aarch64::_compare_long_string_LU
11085           = generate_compare_long_string_different_encoding(true);
11086       StubRoutines::aarch64::_compare_long_string_UL
11087           = generate_compare_long_string_different_encoding(false);
11088     } else {
11089       StubRoutines::aarch64::_compare_long_string_LL
11090           = generate_compare_long_string_sve(LL);
11091       StubRoutines::aarch64::_compare_long_string_UU
11092           = generate_compare_long_string_sve(UU);
11093       StubRoutines::aarch64::_compare_long_string_LU
11094           = generate_compare_long_string_sve(LU);
11095       StubRoutines::aarch64::_compare_long_string_UL
11096           = generate_compare_long_string_sve(UL);
11097     }
11098   }
11099 
11100   // R0 = result
11101   // R1 = str2
11102   // R2 = cnt1
11103   // R3 = str1
11104   // R4 = cnt2
11105   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
11106   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip the initial loading (helps on systems with
  // 1 ld pipeline)
  // 2) we can use a "fast" single-character search algorithm to look for the
  // first symbol with fewer branches (1 branch per loaded register instead
  // of a branch for each symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be used to search for every occurrence of the 1st character, saving
  // a few loads in comparison with a "simpler-but-slower" implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
11121   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
11122     StubId stub_id;
11123     if (str1_isL) {
11124       if (str2_isL) {
11125         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
11126       } else {
11127         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
11128       }
11129     } else {
11130       if (str2_isL) {
11131         ShouldNotReachHere();
11132       } else {
11133         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
11134       }
11135     }
11136     int entry_count = StubInfo::entry_count(stub_id);
11137     assert(entry_count == 1, "sanity check");
11138     address start = load_archive_data(stub_id);
11139     if (start != nullptr) {
11140       return start;
11141     }
11142     __ align(CodeEntryAlignment);
11143     StubCodeMark mark(this, stub_id);
11144     address entry = __ pc();
11145 
11146     int str1_chr_size = str1_isL ? 1 : 2;
11147     int str2_chr_size = str2_isL ? 1 : 2;
11148     int str1_chr_shift = str1_isL ? 0 : 1;
11149     int str2_chr_shift = str2_isL ? 0 : 1;
11150     bool isL = str1_isL && str2_isL;
11151    // parameters
11152     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
11153     // temporary registers
11154     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
11155     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
11156     // redefinitions
11157     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
11158 
11159     __ push(spilled_regs, sp);
11160     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
11161         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
11162         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
11163         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
11164         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
11165         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
11166     // Read whole register from str1. It is safe, because length >=8 here
11167     __ ldr(ch1, Address(str1));
11168     // Read whole register from str2. It is safe, because length >=8 here
11169     __ ldr(ch2, Address(str2));
11170     __ sub(cnt2, cnt2, cnt1);
11171     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
11172     if (str1_isL != str2_isL) {
11173       __ eor(v0, __ T16B, v0, v0);
11174     }
11175     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
11176     __ mul(first, first, tmp1);
11177     // check if we have less than 1 register to check
11178     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
11179     if (str1_isL != str2_isL) {
11180       __ fmovd(v1, ch1);
11181     }
11182     __ br(__ LE, L_SMALL);
11183     __ eor(ch2, first, ch2);
11184     if (str1_isL != str2_isL) {
11185       __ zip1(v1, __ T16B, v1, v0);
11186     }
11187     __ sub(tmp2, ch2, tmp1);
11188     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11189     __ bics(tmp2, tmp2, ch2);
11190     if (str1_isL != str2_isL) {
11191       __ fmovd(ch1, v1);
11192     }
11193     __ br(__ NE, L_HAS_ZERO);
11194     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
11195     __ add(result, result, wordSize/str2_chr_size);
11196     __ add(str2, str2, wordSize);
11197     __ br(__ LT, L_POST_LOOP);
11198     __ BIND(L_LOOP);
11199       __ ldr(ch2, Address(str2));
11200       __ eor(ch2, first, ch2);
11201       __ sub(tmp2, ch2, tmp1);
11202       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11203       __ bics(tmp2, tmp2, ch2);
11204       __ br(__ NE, L_HAS_ZERO);
11205     __ BIND(L_LOOP_PROCEED);
11206       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
11207       __ add(str2, str2, wordSize);
11208       __ add(result, result, wordSize/str2_chr_size);
11209       __ br(__ GE, L_LOOP);
11210     __ BIND(L_POST_LOOP);
11211       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
11212       __ br(__ LE, NOMATCH);
11213       __ ldr(ch2, Address(str2));
11214       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
11215       __ eor(ch2, first, ch2);
11216       __ sub(tmp2, ch2, tmp1);
11217       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11218       __ mov(tmp4, -1); // all bits set
11219       __ b(L_SMALL_PROCEED);
11220     __ align(OptoLoopAlignment);
11221     __ BIND(L_SMALL);
11222       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
11223       __ eor(ch2, first, ch2);
11224       if (str1_isL != str2_isL) {
11225         __ zip1(v1, __ T16B, v1, v0);
11226       }
11227       __ sub(tmp2, ch2, tmp1);
11228       __ mov(tmp4, -1); // all bits set
11229       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11230       if (str1_isL != str2_isL) {
11231         __ fmovd(ch1, v1); // move converted 4 symbols
11232       }
11233     __ BIND(L_SMALL_PROCEED);
11234       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
11235       __ bic(tmp2, tmp2, ch2);
11236       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
11237       __ rbit(tmp2, tmp2);
11238       __ br(__ EQ, NOMATCH);
11239     __ BIND(L_SMALL_HAS_ZERO_LOOP);
11240       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
11241       __ cmp(cnt1, u1(wordSize/str2_chr_size));
11242       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
11243       if (str2_isL) { // LL
11244         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
11245         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
11246         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
11247         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
11248         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11249       } else {
11250         __ mov(ch2, 0xE); // all bits in byte set except last one
11251         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11252         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11253         __ lslv(tmp2, tmp2, tmp4);
11254         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11255         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11256         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11257         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11258       }
11259       __ cmp(ch1, ch2);
11260       __ mov(tmp4, wordSize/str2_chr_size);
11261       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11262     __ BIND(L_SMALL_CMP_LOOP);
11263       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
11264                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
11265       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
11266                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
11267       __ add(tmp4, tmp4, 1);
11268       __ cmp(tmp4, cnt1);
11269       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
11270       __ cmp(first, ch2);
11271       __ br(__ EQ, L_SMALL_CMP_LOOP);
11272     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
11273       __ cbz(tmp2, NOMATCH); // no more matches. exit
11274       __ clz(tmp4, tmp2);
11275       __ add(result, result, 1); // advance index
11276       __ add(str2, str2, str2_chr_size); // advance pointer
11277       __ b(L_SMALL_HAS_ZERO_LOOP);
11278     __ align(OptoLoopAlignment);
11279     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
11280       __ cmp(first, ch2);
11281       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11282       __ b(DONE);
11283     __ align(OptoLoopAlignment);
11284     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
11285       if (str2_isL) { // LL
11286         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
11287         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
11288         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
11289         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
11290         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11291       } else {
11292         __ mov(ch2, 0xE); // all bits in byte set except last one
11293         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11294         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11295         __ lslv(tmp2, tmp2, tmp4);
11296         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11297         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11298         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11299         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11300       }
11301       __ cmp(ch1, ch2);
11302       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11303       __ b(DONE);
11304     __ align(OptoLoopAlignment);
11305     __ BIND(L_HAS_ZERO);
11306       __ rbit(tmp2, tmp2);
11307       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
11308       // Now, perform compression of counters(cnt2 and cnt1) into one register.
11309       // It's fine because both counters are 32bit and are not changed in this
11310       // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
11311       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
11312       __ sub(result, result, 1);
11313     __ BIND(L_HAS_ZERO_LOOP);
11314       __ mov(cnt1, wordSize/str2_chr_size);
11315       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
11316       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
11317       if (str2_isL) {
11318         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
11319         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11320         __ lslv(tmp2, tmp2, tmp4);
11321         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11322         __ add(tmp4, tmp4, 1);
11323         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11324         __ lsl(tmp2, tmp2, 1);
11325         __ mov(tmp4, wordSize/str2_chr_size);
11326       } else {
11327         __ mov(ch2, 0xE);
11328         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11329         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11330         __ lslv(tmp2, tmp2, tmp4);
11331         __ add(tmp4, tmp4, 1);
11332         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11333         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
11334         __ lsl(tmp2, tmp2, 1);
11335         __ mov(tmp4, wordSize/str2_chr_size);
11336         __ sub(str2, str2, str2_chr_size);
11337       }
11338       __ cmp(ch1, ch2);
11339       __ mov(tmp4, wordSize/str2_chr_size);
11340       __ br(__ NE, L_CMP_LOOP_NOMATCH);
11341     __ BIND(L_CMP_LOOP);
11342       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
11343                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
11344       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
11345                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
11346       __ add(tmp4, tmp4, 1);
11347       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
11348       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
11349       __ cmp(cnt1, ch2);
11350       __ br(__ EQ, L_CMP_LOOP);
11351     __ BIND(L_CMP_LOOP_NOMATCH);
11352       // here we're not matched
11353       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
11354       __ clz(tmp4, tmp2);
11355       __ add(str2, str2, str2_chr_size); // advance pointer
11356       __ b(L_HAS_ZERO_LOOP);
11357     __ align(OptoLoopAlignment);
11358     __ BIND(L_CMP_LOOP_LAST_CMP);
11359       __ cmp(cnt1, ch2);
11360       __ br(__ NE, L_CMP_LOOP_NOMATCH);
11361       __ b(DONE);
11362     __ align(OptoLoopAlignment);
11363     __ BIND(L_CMP_LOOP_LAST_CMP2);
11364       if (str2_isL) {
11365         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
11366         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11367         __ lslv(tmp2, tmp2, tmp4);
11368         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11369         __ add(tmp4, tmp4, 1);
11370         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11371         __ lsl(tmp2, tmp2, 1);
11372       } else {
11373         __ mov(ch2, 0xE);
11374         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11375         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11376         __ lslv(tmp2, tmp2, tmp4);
11377         __ add(tmp4, tmp4, 1);
11378         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11379         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
11380         __ lsl(tmp2, tmp2, 1);
11381         __ sub(str2, str2, str2_chr_size);
11382       }
11383       __ cmp(ch1, ch2);
11384       __ br(__ NE, L_CMP_LOOP_NOMATCH);
11385       __ b(DONE);
11386     __ align(OptoLoopAlignment);
11387     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
11388       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
11389       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
11390       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
11391       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
11392       // result by analyzed characters value, so, we can just reset lower bits
11393       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
11394       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
11395       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
11396       // index of last analyzed substring inside current octet. So, str2 in at
11397       // respective start address. We need to advance it to next octet
11398       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
11399       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
11400       __ bfm(result, zr, 0, 2 - str2_chr_shift);
11401       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
11402       __ movw(cnt2, cnt2);
11403       __ b(L_LOOP_PROCEED);
11404     __ align(OptoLoopAlignment);
11405     __ BIND(NOMATCH);
11406       __ mov(result, -1);
11407     __ BIND(DONE);
11408       __ pop(spilled_regs, sp);
11409       __ ret(lr);
11410 
11411     // record the stub entry and end
11412     store_archive_data(stub_id, entry, __ pc());
11413 
11414     return entry;
11415   }
11416 
11417   void generate_string_indexof_stubs() {
11418     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
11419     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
11420     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
11421   }
11422 
11423   void inflate_and_store_2_fp_registers(bool generatePrfm,
11424       FloatRegister src1, FloatRegister src2) {
11425     Register dst = r1;
11426     __ zip1(v1, __ T16B, src1, v0);
11427     __ zip2(v2, __ T16B, src1, v0);
11428     if (generatePrfm) {
11429       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
11430     }
11431     __ zip1(v3, __ T16B, src2, v0);
11432     __ zip2(v4, __ T16B, src2, v0);
11433     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
11434   }
11435 
11436   // R0 = src
11437   // R1 = dst
11438   // R2 = len
11439   // R3 = len >> 3
11440   // V0 = 0
11441   // v1 = loaded 8 bytes
11442   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
11443   address generate_large_byte_array_inflate() {
11444     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
11445     int entry_count = StubInfo::entry_count(stub_id);
11446     assert(entry_count == 1, "sanity check");
11447     address start = load_archive_data(stub_id);
11448     if (start != nullptr) {
11449       return start;
11450     }
11451     __ align(CodeEntryAlignment);
11452     StubCodeMark mark(this, stub_id);
11453     address entry = __ pc();
11454     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
11455     Register src = r0, dst = r1, len = r2, octetCounter = r3;
11456     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
11457 
11458     // do one more 8-byte read to have address 16-byte aligned in most cases
11459     // also use single store instruction
11460     __ ldrd(v2, __ post(src, 8));
11461     __ sub(octetCounter, octetCounter, 2);
11462     __ zip1(v1, __ T16B, v1, v0);
11463     __ zip1(v2, __ T16B, v2, v0);
11464     __ st1(v1, v2, __ T16B, __ post(dst, 32));
11465     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
11466     __ subs(rscratch1, octetCounter, large_loop_threshold);
11467     __ br(__ LE, LOOP_START);
11468     __ b(LOOP_PRFM_START);
11469     __ bind(LOOP_PRFM);
11470       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
11471     __ bind(LOOP_PRFM_START);
11472       __ prfm(Address(src, SoftwarePrefetchHintDistance));
11473       __ sub(octetCounter, octetCounter, 8);
11474       __ subs(rscratch1, octetCounter, large_loop_threshold);
11475       inflate_and_store_2_fp_registers(true, v3, v4);
11476       inflate_and_store_2_fp_registers(true, v5, v6);
11477       __ br(__ GT, LOOP_PRFM);
11478       __ cmp(octetCounter, (u1)8);
11479       __ br(__ LT, DONE);
11480     __ bind(LOOP);
11481       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
11482       __ bind(LOOP_START);
11483       __ sub(octetCounter, octetCounter, 8);
11484       __ cmp(octetCounter, (u1)8);
11485       inflate_and_store_2_fp_registers(false, v3, v4);
11486       inflate_and_store_2_fp_registers(false, v5, v6);
11487       __ br(__ GE, LOOP);
11488     __ bind(DONE);
11489       __ ret(lr);
11490 
11491     // record the stub entry and end
11492     store_archive_data(stub_id, entry, __ pc());
11493 
11494     return entry;
11495   }
11496 
11497   /**
11498    *  Arguments:
11499    *
11500    *  Input:
11501    *  c_rarg0   - current state address
11502    *  c_rarg1   - H key address
11503    *  c_rarg2   - data address
11504    *  c_rarg3   - number of blocks
11505    *
11506    *  Output:
11507    *  Updated state at c_rarg0
11508    */
11509   address generate_ghash_processBlocks_small() {
11510     // Bafflingly, GCM uses little-endian for the byte order, but
11511     // big-endian for the bit order.  For example, the polynomial 1 is
11512     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
11513     //
11514     // So, we must either reverse the bytes in each word and do
11515     // everything big-endian or reverse the bits in each byte and do
11516     // it little-endian.  On AArch64 it's more idiomatic to reverse
11517     // the bits in each byte (we have an instruction, RBIT, to do
11518     // that) and keep the data in little-endian bit order through the
11519     // calculation, bit-reversing the inputs and outputs.
11520 
11521     StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
11522     int entry_count = StubInfo::entry_count(stub_id);
11523     assert(entry_count == 1, "sanity check");
11524     address start = load_archive_data(stub_id);
11525     if (start != nullptr) {
11526       return start;
11527     }
11528     __ align(CodeEntryAlignment);
11529     StubCodeMark mark(this, stub_id);
11530     Label polynomial; // local data generated at end of stub
11531     start = __ pc();
11532 
11533     Register state   = c_rarg0;
11534     Register subkeyH = c_rarg1;
11535     Register data    = c_rarg2;
11536     Register blocks  = c_rarg3;
11537 
11538     FloatRegister vzr = v30;
11539     __ eor(vzr, __ T16B, vzr, vzr); // zero register
11540 
11541     __ adr(rscratch1, polynomial);
11542     __ ldrq(v24, rscratch1);    // The field polynomial
11543 
11544     __ ldrq(v0, Address(state));
11545     __ ldrq(v1, Address(subkeyH));
11546 
11547     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
11548     __ rbit(v0, __ T16B, v0);
11549     __ rev64(v1, __ T16B, v1);
11550     __ rbit(v1, __ T16B, v1);
11551 
11552     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
11553     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
11554 
11555     {
11556       Label L_ghash_loop;
11557       __ bind(L_ghash_loop);
11558 
11559       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
11560                                                  // reversing each byte
11561       __ rbit(v2, __ T16B, v2);
11562       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
11563 
11564       // Multiply state in v2 by subkey in v1
11565       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
11566                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
11567                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
11568       // Reduce v7:v5 by the field polynomial
11569       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
11570 
11571       __ sub(blocks, blocks, 1);
11572       __ cbnz(blocks, L_ghash_loop);
11573     }
11574 
11575     // The bit-reversed result is at this point in v0
11576     __ rev64(v0, __ T16B, v0);
11577     __ rbit(v0, __ T16B, v0);
11578 
11579     __ st1(v0, __ T16B, state);
11580     __ ret(lr);
11581 
11582     // bind label and generate local polynomial data
11583     __ align(wordSize * 2);
11584     __ bind(polynomial);
11585     __ emit_int64(0x87);  // The low-order bits of the field
11586                           // polynomial (i.e. p = z^7+z^2+z+1)
11587                           // repeated in the low and high parts of a
11588                           // 128-bit vector
11589     __ emit_int64(0x87);
11590 
11591     // record the stub entry and end
11592     store_archive_data(stub_id, start, __ pc());
11593 
11594     return start;
11595   }
11596 
11597   address generate_ghash_processBlocks(address small) {
11598     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
11599     int entry_count = StubInfo::entry_count(stub_id);
11600     assert(entry_count == 1, "sanity check");
11601     address start = load_archive_data(stub_id);
11602     if (start != nullptr) {
11603       return start;
11604     }
11605     Label polynomial;           // local data generated after stub
11606     __ align(CodeEntryAlignment);
11607     StubCodeMark mark(this, stub_id);
11608     start = __ pc();
11609 
11610     Register state   = c_rarg0;
11611     Register subkeyH = c_rarg1;
11612     Register data    = c_rarg2;
11613     Register blocks  = c_rarg3;
11614 
11615     const int unroll = 4;
11616 
11617     __ cmp(blocks, (unsigned char)(unroll * 2));
11618     __ br(__ LT, small);
11619 
11620     if (unroll > 1) {
11621     // Save state before entering routine
11622       __ sub(sp, sp, 4 * 16);
11623       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
11624       __ sub(sp, sp, 4 * 16);
11625       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
11626     }
11627 
11628     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
11629 
11630     if (unroll > 1) {
11631       // And restore state
11632       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
11633       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
11634     }
11635 
11636     __ cmp(blocks, (unsigned char)0);
11637     __ br(__ GT, small);
11638 
11639     __ ret(lr);
11640 
11641     // bind label and generate polynomial data
11642     __ align(wordSize * 2);
11643     __ bind(polynomial);
11644     __ emit_int64(0x87);  // The low-order bits of the field
11645                           // polynomial (i.e. p = z^7+z^2+z+1)
11646                           // repeated in the low and high parts of a
11647                           // 128-bit vector
11648     __ emit_int64(0x87);
11649 
11650     // record the stub entry and end
11651     store_archive_data(stub_id, start, __ pc());
11652 
11653     return start;
11654   }
11655 
11656   void generate_base64_encode_simdround(Register src, Register dst,
11657         FloatRegister codec, u8 size) {
11658 
11659     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
11660     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
11661     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
11662 
11663     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
11664 
11665     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
11666 
11667     __ ushr(ind0, arrangement, in0,  2);
11668 
11669     __ ushr(ind1, arrangement, in1,  2);
11670     __ shl(in0,   arrangement, in0,  6);
11671     __ orr(ind1,  arrangement, ind1, in0);
11672     __ ushr(ind1, arrangement, ind1, 2);
11673 
11674     __ ushr(ind2, arrangement, in2,  4);
11675     __ shl(in1,   arrangement, in1,  4);
11676     __ orr(ind2,  arrangement, in1,  ind2);
11677     __ ushr(ind2, arrangement, ind2, 2);
11678 
11679     __ shl(ind3,  arrangement, in2,  2);
11680     __ ushr(ind3, arrangement, ind3, 2);
11681 
11682     __ tbl(out0,  arrangement, codec,  4, ind0);
11683     __ tbl(out1,  arrangement, codec,  4, ind1);
11684     __ tbl(out2,  arrangement, codec,  4, ind2);
11685     __ tbl(out3,  arrangement, codec,  4, ind3);
11686 
11687     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
11688   }
11689 
11690    /**
11691    *  Arguments:
11692    *
11693    *  Input:
11694    *  c_rarg0   - src_start
11695    *  c_rarg1   - src_offset
11696    *  c_rarg2   - src_length
11697    *  c_rarg3   - dest_start
11698    *  c_rarg4   - dest_offset
11699    *  c_rarg5   - isURL
11700    *
11701    */
11702   address generate_base64_encodeBlock() {
11703 
11704     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
11705     int entry_count = StubInfo::entry_count(stub_id);
11706     assert(entry_count == 1, "sanity check");
11707     address start = load_archive_data(stub_id);
11708     if (start != nullptr) {
11709       return start;
11710     }
11711     __ align(CodeEntryAlignment);
11712     StubCodeMark mark(this, stub_id);
11713     start = __ pc();
11714 
11715     Register src   = c_rarg0;  // source array
11716     Register soff  = c_rarg1;  // source start offset
11717     Register send  = c_rarg2;  // source end offset
11718     Register dst   = c_rarg3;  // dest array
11719     Register doff  = c_rarg4;  // position for writing to dest array
11720     Register isURL = c_rarg5;  // Base64 or URL character set
11721 
11722     // c_rarg6 and c_rarg7 are free to use as temps
11723     Register codec  = c_rarg6;
11724     Register length = c_rarg7;
11725 
11726     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
11727 
11728     __ add(src, src, soff);
11729     __ add(dst, dst, doff);
11730     __ sub(length, send, soff);
11731 
11732     // load the codec base address
11733     __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
11734     __ cbz(isURL, ProcessData);
11735     __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));
11736 
11737     __ BIND(ProcessData);
11738 
11739     // too short to formup a SIMD loop, roll back
11740     __ cmp(length, (u1)24);
11741     __ br(Assembler::LT, Process3B);
11742 
11743     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
11744 
11745     __ BIND(Process48B);
11746     __ cmp(length, (u1)48);
11747     __ br(Assembler::LT, Process24B);
11748     generate_base64_encode_simdround(src, dst, v0, 16);
11749     __ sub(length, length, 48);
11750     __ b(Process48B);
11751 
11752     __ BIND(Process24B);
11753     __ cmp(length, (u1)24);
11754     __ br(Assembler::LT, SIMDExit);
11755     generate_base64_encode_simdround(src, dst, v0, 8);
11756     __ sub(length, length, 24);
11757 
11758     __ BIND(SIMDExit);
11759     __ cbz(length, Exit);
11760 
11761     __ BIND(Process3B);
11762     //  3 src bytes, 24 bits
11763     __ ldrb(r10, __ post(src, 1));
11764     __ ldrb(r11, __ post(src, 1));
11765     __ ldrb(r12, __ post(src, 1));
11766     __ orrw(r11, r11, r10, Assembler::LSL, 8);
11767     __ orrw(r12, r12, r11, Assembler::LSL, 8);
11768     // codec index
11769     __ ubfmw(r15, r12, 18, 23);
11770     __ ubfmw(r14, r12, 12, 17);
11771     __ ubfmw(r13, r12, 6,  11);
11772     __ andw(r12,  r12, 63);
11773     // get the code based on the codec
11774     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
11775     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
11776     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
11777     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
11778     __ strb(r15, __ post(dst, 1));
11779     __ strb(r14, __ post(dst, 1));
11780     __ strb(r13, __ post(dst, 1));
11781     __ strb(r12, __ post(dst, 1));
11782     __ sub(length, length, 3);
11783     __ cbnz(length, Process3B);
11784 
11785     __ BIND(Exit);
11786     __ ret(lr);
11787 
11788     // record the stub entry and end
11789     store_archive_data(stub_id, start, __ pc());
11790 
11791     return start;
11792   }
11793 
11794   void generate_base64_decode_simdround(Register src, Register dst,
11795         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
11796 
11797     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
11798     FloatRegister out0 = v20, out1 = v21, out2 = v22;
11799 
11800     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
11801     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
11802 
11803     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
11804 
11805     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
11806 
11807     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
11808 
11809     // we need unsigned saturating subtract, to make sure all input values
11810     // in range [0, 63] will have 0U value in the higher half lookup
11811     __ uqsubv(decH0, __ T16B, in0, v27);
11812     __ uqsubv(decH1, __ T16B, in1, v27);
11813     __ uqsubv(decH2, __ T16B, in2, v27);
11814     __ uqsubv(decH3, __ T16B, in3, v27);
11815 
11816     // lower half lookup
11817     __ tbl(decL0, arrangement, codecL, 4, in0);
11818     __ tbl(decL1, arrangement, codecL, 4, in1);
11819     __ tbl(decL2, arrangement, codecL, 4, in2);
11820     __ tbl(decL3, arrangement, codecL, 4, in3);
11821 
11822     // higher half lookup
11823     __ tbx(decH0, arrangement, codecH, 4, decH0);
11824     __ tbx(decH1, arrangement, codecH, 4, decH1);
11825     __ tbx(decH2, arrangement, codecH, 4, decH2);
11826     __ tbx(decH3, arrangement, codecH, 4, decH3);
11827 
11828     // combine lower and higher
11829     __ orr(decL0, arrangement, decL0, decH0);
11830     __ orr(decL1, arrangement, decL1, decH1);
11831     __ orr(decL2, arrangement, decL2, decH2);
11832     __ orr(decL3, arrangement, decL3, decH3);
11833 
11834     // check illegal inputs, value larger than 63 (maximum of 6 bits)
11835     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
11836     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
11837     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
11838     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
11839     __ orr(in0, arrangement, decH0, decH1);
11840     __ orr(in1, arrangement, decH2, decH3);
11841     __ orr(in2, arrangement, in0,   in1);
11842     __ umaxv(in3, arrangement, in2);
11843     __ umov(rscratch2, in3, __ B, 0);
11844 
11845     // get the data to output
11846     __ shl(out0,  arrangement, decL0, 2);
11847     __ ushr(out1, arrangement, decL1, 4);
11848     __ orr(out0,  arrangement, out0,  out1);
11849     __ shl(out1,  arrangement, decL1, 4);
11850     __ ushr(out2, arrangement, decL2, 2);
11851     __ orr(out1,  arrangement, out1,  out2);
11852     __ shl(out2,  arrangement, decL2, 6);
11853     __ orr(out2,  arrangement, out2,  decL3);
11854 
11855     __ cbz(rscratch2, NoIllegalData);
11856 
11857     // handle illegal input
11858     __ umov(r10, in2, __ D, 0);
11859     if (size == 16) {
11860       __ cbnz(r10, ErrorInLowerHalf);
11861 
11862       // illegal input is in higher half, store the lower half now.
11863       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
11864 
11865       __ umov(r10, in2,  __ D, 1);
11866       __ umov(r11, out0, __ D, 1);
11867       __ umov(r12, out1, __ D, 1);
11868       __ umov(r13, out2, __ D, 1);
11869       __ b(StoreLegalData);
11870 
11871       __ BIND(ErrorInLowerHalf);
11872     }
11873     __ umov(r11, out0, __ D, 0);
11874     __ umov(r12, out1, __ D, 0);
11875     __ umov(r13, out2, __ D, 0);
11876 
11877     __ BIND(StoreLegalData);
11878     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
11879     __ strb(r11, __ post(dst, 1));
11880     __ strb(r12, __ post(dst, 1));
11881     __ strb(r13, __ post(dst, 1));
11882     __ lsr(r10, r10, 8);
11883     __ lsr(r11, r11, 8);
11884     __ lsr(r12, r12, 8);
11885     __ lsr(r13, r13, 8);
11886     __ b(StoreLegalData);
11887 
11888     __ BIND(NoIllegalData);
11889     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
11890   }
11891 
11892 
11893    /**
11894    *  Arguments:
11895    *
11896    *  Input:
11897    *  c_rarg0   - src_start
11898    *  c_rarg1   - src_offset
11899    *  c_rarg2   - src_length
11900    *  c_rarg3   - dest_start
11901    *  c_rarg4   - dest_offset
11902    *  c_rarg5   - isURL
11903    *  c_rarg6   - isMIME
11904    *
11905    */
11906   address generate_base64_decodeBlock() {
11907 
11908     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
11909     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
11910     // titled "Base64 decoding".
11911 
11912     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
11913     int entry_count = StubInfo::entry_count(stub_id);
11914     assert(entry_count == 1, "sanity check");
11915     address start = load_archive_data(stub_id);
11916     if (start != nullptr) {
11917       return start;
11918     }
11919     __ align(CodeEntryAlignment);
11920     StubCodeMark mark(this, stub_id);
11921     start = __ pc();
11922 
11923     Register src    = c_rarg0;  // source array
11924     Register soff   = c_rarg1;  // source start offset
11925     Register send   = c_rarg2;  // source end offset
11926     Register dst    = c_rarg3;  // dest array
11927     Register doff   = c_rarg4;  // position for writing to dest array
11928     Register isURL  = c_rarg5;  // Base64 or URL character set
11929     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
11930 
11931     Register length = send;    // reuse send as length of source data to process
11932 
11933     Register simd_codec   = c_rarg6;
11934     Register nosimd_codec = c_rarg7;
11935 
11936     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
11937 
11938     __ enter();
11939 
11940     __ add(src, src, soff);
11941     __ add(dst, dst, doff);
11942 
11943     __ mov(doff, dst);
11944 
11945     __ sub(length, send, soff);
11946     __ bfm(length, zr, 0, 1);
11947 
11948     __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
11949     __ cbz(isURL, ProcessData);
11950     __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));
11951 
11952     __ BIND(ProcessData);
11953     __ mov(rscratch1, length);
11954     __ cmp(length, (u1)144); // 144 = 80 + 64
11955     __ br(Assembler::LT, Process4B);
11956 
11957     // In the MIME case, the line length cannot be more than 76
11958     // bytes (see RFC 2045). This is too short a block for SIMD
11959     // to be worthwhile, so we use non-SIMD here.
11960     __ movw(rscratch1, 79);
11961 
11962     __ BIND(Process4B);
11963     __ ldrw(r14, __ post(src, 4));
11964     __ ubfxw(r10, r14, 0,  8);
11965     __ ubfxw(r11, r14, 8,  8);
11966     __ ubfxw(r12, r14, 16, 8);
11967     __ ubfxw(r13, r14, 24, 8);
11968     // get the de-code
11969     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
11970     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
11971     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
11972     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
11973     // error detection, 255u indicates an illegal input
11974     __ orrw(r14, r10, r11);
11975     __ orrw(r15, r12, r13);
11976     __ orrw(r14, r14, r15);
11977     __ tbnz(r14, 7, Exit);
11978     // recover the data
11979     __ lslw(r14, r10, 10);
11980     __ bfiw(r14, r11, 4, 6);
11981     __ bfmw(r14, r12, 2, 5);
11982     __ rev16w(r14, r14);
11983     __ bfiw(r13, r12, 6, 2);
11984     __ strh(r14, __ post(dst, 2));
11985     __ strb(r13, __ post(dst, 1));
11986     // non-simd loop
11987     __ subsw(rscratch1, rscratch1, 4);
11988     __ br(Assembler::GT, Process4B);
11989 
11990     // if exiting from PreProcess80B, rscratch1 == -1;
11991     // otherwise, rscratch1 == 0.
11992     __ cbzw(rscratch1, Exit);
11993     __ sub(length, length, 80);
11994 
11995     __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
11996     __ cbz(isURL, SIMDEnter);
11997     __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));
11998 
11999     __ BIND(SIMDEnter);
12000     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
12001     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
12002     __ mov(rscratch1, 63);
12003     __ dup(v27, __ T16B, rscratch1);
12004 
12005     __ BIND(Process64B);
12006     __ cmp(length, (u1)64);
12007     __ br(Assembler::LT, Process32B);
12008     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
12009     __ sub(length, length, 64);
12010     __ b(Process64B);
12011 
12012     __ BIND(Process32B);
12013     __ cmp(length, (u1)32);
12014     __ br(Assembler::LT, SIMDExit);
12015     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
12016     __ sub(length, length, 32);
12017     __ b(Process32B);
12018 
12019     __ BIND(SIMDExit);
12020     __ cbz(length, Exit);
12021     __ movw(rscratch1, length);
12022     __ b(Process4B);
12023 
12024     __ BIND(Exit);
12025     __ sub(c_rarg0, dst, doff);
12026 
12027     __ leave();
12028     __ ret(lr);
12029 
12030     // record the stub entry and end
12031     store_archive_data(stub_id, start, __ pc());
12032 
12033     return start;
12034   }
12035 
12036   // Support for spin waits.
12037   address generate_spin_wait() {
12038     StubId stub_id = StubId::stubgen_spin_wait_id;
12039     int entry_count = StubInfo::entry_count(stub_id);
12040     assert(entry_count == 1, "sanity check");
12041     address start = load_archive_data(stub_id);
12042     if (start != nullptr) {
12043       return start;
12044     }
12045     __ align(CodeEntryAlignment);
12046     StubCodeMark mark(this, stub_id);
12047     start = __ pc();
12048 
12049     __ spin_wait();
12050     __ ret(lr);
12051 
12052     // record the stub entry and end
12053     store_archive_data(stub_id, start, __ pc());
12054 
12055     return start;
12056   }
12057 
12058   void generate_lookup_secondary_supers_table_stub() {
12059     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
12060     GrowableArray<address> entries;
12061     int entry_count = StubInfo::entry_count(stub_id);
12062     assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
12063     address start = load_archive_data(stub_id, &entries);
12064     if (start != nullptr) {
12065       assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
12066              "unexpected extra entry count %d", entries.length());
12067       StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
12068       for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
12069         StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
12070       }
12071       return;
12072     }
12073 
12074     StubCodeMark mark(this, stub_id);
12075 
12076     const Register
12077       r_super_klass  = r0,
12078       r_array_base   = r1,
12079       r_array_length = r2,
12080       r_array_index  = r3,
12081       r_sub_klass    = r4,
12082       r_bitmap       = rscratch2,
12083       result         = r5;
12084     const FloatRegister
12085       vtemp          = v0;
12086 
12087     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
12088       address next_entry = __ pc();
12089       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
12090       if (slot == 0) {
12091         start = next_entry;
12092       } else {
12093         entries.append(next_entry);
12094       }
12095       Label L_success;
12096       __ enter();
12097       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
12098                                              r_array_base, r_array_length, r_array_index,
12099                                              vtemp, result, slot,
12100                                              /*stub_is_near*/true);
12101       __ leave();
12102       __ ret(lr);
12103     }
12104     // record the stub entry and end plus all the auxiliary entries
12105     store_archive_data(stub_id, start, __ pc(), &entries);
12106   }
12107 
12108   // Slow path implementation for UseSecondarySupersTable.
12109   address generate_lookup_secondary_supers_table_slow_path_stub() {
12110     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
12111     int entry_count = StubInfo::entry_count(stub_id);
12112     assert(entry_count == 1, "sanity check");
12113     address start = load_archive_data(stub_id);
12114     if (start != nullptr) {
12115       return start;
12116     }
12117     StubCodeMark mark(this, stub_id);
12118     start = __ pc();
12119     const Register
12120       r_super_klass  = r0,        // argument
12121       r_array_base   = r1,        // argument
12122       temp1          = r2,        // temp
12123       r_array_index  = r3,        // argument
12124       r_bitmap       = rscratch2, // argument
12125       result         = r5;        // argument
12126 
12127     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
12128     __ ret(lr);
12129 
12130     // record the stub entry and end
12131     store_archive_data(stub_id, start, __ pc());
12132 
12133     return start;
12134   }
12135 
12136 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12137 
12138   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
12139   //
12140   // If LSE is in use, generate LSE versions of all the stubs. The
12141   // non-LSE versions are in atomic_aarch64.S.
12142 
12143   // class AtomicStubMark records the entry point of a stub and the
12144   // stub pointer which will point to it. The stub pointer is set to
12145   // the entry point when ~AtomicStubMark() is called, which must be
12146   // after ICache::invalidate_range. This ensures safe publication of
12147   // the generated code.
12148   class AtomicStubMark {
12149     address _entry_point;
12150     aarch64_atomic_stub_t *_stub;
12151     MacroAssembler *_masm;
12152   public:
12153     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
12154       _masm = masm;
12155       __ align(32);
12156       _entry_point = __ pc();
12157       _stub = stub;
12158     }
12159     ~AtomicStubMark() {
12160       *_stub = (aarch64_atomic_stub_t)_entry_point;
12161     }
12162   };
12163 
12164   // NB: For memory_order_conservative we need a trailing membar after
12165   // LSE atomic operations but not a leading membar.
12166   //
12167   // We don't need a leading membar because a clause in the Arm ARM
12168   // says:
12169   //
12170   //   Barrier-ordered-before
12171   //
12172   //   Barrier instructions order prior Memory effects before subsequent
12173   //   Memory effects generated by the same Observer. A read or a write
12174   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
12175   //   Observer if and only if RW1 appears in program order before RW 2
12176   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
12177   //   instruction with both Acquire and Release semantics.
12178   //
12179   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
12180   // and Release semantics, therefore we don't need a leading
12181   // barrier. However, there is no corresponding Barrier-ordered-after
12182   // relationship, therefore we need a trailing membar to prevent a
12183   // later store or load from being reordered with the store in an
12184   // atomic instruction.
12185   //
12186   // This was checked by using the herd7 consistency model simulator
12187   // (http://diy.inria.fr/) with this test case:
12188   //
12189   // AArch64 LseCas
12190   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
12191   // P0 | P1;
12192   // LDR W4, [X2] | MOV W3, #0;
12193   // DMB LD       | MOV W4, #1;
12194   // LDR W3, [X1] | CASAL W3, W4, [X1];
12195   //              | DMB ISH;
12196   //              | STR W4, [X2];
12197   // exists
12198   // (0:X3=0 /\ 0:X4=1)
12199   //
12200   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
12201   // with the store to x in P1. Without the DMB in P1 this may happen.
12202   //
12203   // At the time of writing we don't know of any AArch64 hardware that
12204   // reorders stores in this way, but the Reference Manual permits it.
12205 
12206   void gen_cas_entry(Assembler::operand_size size,
12207                      atomic_memory_order order) {
12208     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
12209       exchange_val = c_rarg2;
12210     bool acquire, release;
12211     switch (order) {
12212       case memory_order_relaxed:
12213         acquire = false;
12214         release = false;
12215         break;
12216       case memory_order_release:
12217         acquire = false;
12218         release = true;
12219         break;
12220       default:
12221         acquire = true;
12222         release = true;
12223         break;
12224     }
12225     __ mov(prev, compare_val);
12226     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
12227     if (order == memory_order_conservative) {
12228       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12229     }
12230     if (size == Assembler::xword) {
12231       __ mov(r0, prev);
12232     } else {
12233       __ movw(r0, prev);
12234     }
12235     __ ret(lr);
12236   }
12237 
12238   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
12239     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
12240     // If not relaxed, then default to conservative.  Relaxed is the only
12241     // case we use enough to be worth specializing.
12242     if (order == memory_order_relaxed) {
12243       __ ldadd(size, incr, prev, addr);
12244     } else {
12245       __ ldaddal(size, incr, prev, addr);
12246       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12247     }
12248     if (size == Assembler::xword) {
12249       __ mov(r0, prev);
12250     } else {
12251       __ movw(r0, prev);
12252     }
12253     __ ret(lr);
12254   }
12255 
12256   void gen_swpal_entry(Assembler::operand_size size) {
12257     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
12258     __ swpal(size, incr, prev, addr);
12259     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12260     if (size == Assembler::xword) {
12261       __ mov(r0, prev);
12262     } else {
12263       __ movw(r0, prev);
12264     }
12265     __ ret(lr);
12266   }
12267 
12268   void generate_atomic_entry_points() {
12269     if (! UseLSE) {
12270       return;
12271     }
12272     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
12273     GrowableArray<address> entries;
12274     int entry_count = StubInfo::entry_count(stub_id);
12275     address start = load_archive_data(stub_id, &entries);
12276     if (start != nullptr) {
12277       assert(entries.length() == entry_count - 1,
12278              "unexpected extra entry count %d", entries.length());
12279       aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
12280       int idx = 0;
12281       aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12282       aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12283       aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12284       aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12285       aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12286       aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12287       aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12288       aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12289       aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12290       aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12291       aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12292       aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12293       aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12294       aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12295       aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12296       assert(idx == entries.length(), "sanity!");
12297       return;
12298     }
12299 
12300     __ align(CodeEntryAlignment);
12301     StubCodeMark mark(this, stub_id);
12302     start = __ pc();
12303     address end;
12304     {
12305     // ADD, memory_order_conservative
12306     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
12307     gen_ldadd_entry(Assembler::word, memory_order_conservative);
12308 
12309     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
12310     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
12311 
12312     // ADD, memory_order_relaxed
12313     AtomicStubMark mark_fetch_add_4_relaxed
12314       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
12315     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
12316 
12317     AtomicStubMark mark_fetch_add_8_relaxed
12318       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
12319     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
12320 
12321     // XCHG, memory_order_conservative
12322     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
12323     gen_swpal_entry(Assembler::word);
12324 
12325     AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
12326     gen_swpal_entry(Assembler::xword);
12327 
12328     // CAS, memory_order_conservative
12329     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
12330     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
12331 
12332     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
12333     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
12334 
12335     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
12336     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
12337 
12338     // CAS, memory_order_relaxed
12339     AtomicStubMark mark_cmpxchg_1_relaxed
12340       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
12341     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
12342 
12343     AtomicStubMark mark_cmpxchg_4_relaxed
12344       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
12345     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
12346 
12347     AtomicStubMark mark_cmpxchg_8_relaxed
12348       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
12349     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
12350 
12351     AtomicStubMark mark_cmpxchg_4_release
12352       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
12353     gen_cas_entry(MacroAssembler::word, memory_order_release);
12354 
12355     AtomicStubMark mark_cmpxchg_8_release
12356       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
12357     gen_cas_entry(MacroAssembler::xword, memory_order_release);
12358 
12359     AtomicStubMark mark_cmpxchg_4_seq_cst
12360       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
12361     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
12362 
12363     AtomicStubMark mark_cmpxchg_8_seq_cst
12364       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
12365     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
12366 
12367     end = __ pc();
12368 
12369     ICache::invalidate_range(start, end - start);
12370     // exit block to force update of AtomicStubMark targets
12371     }
12372 
12373     assert(start == (address)aarch64_atomic_fetch_add_4_impl,
12374            "atomic stub should be at start of buffer");
12375     // record the stub start and end plus all the entries saved by the
12376     // AtomicStubMark destructor
12377     entries.append((address)aarch64_atomic_fetch_add_8_impl);
12378     entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
12379     entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
12380     entries.append((address)aarch64_atomic_xchg_4_impl);
12381     entries.append((address)aarch64_atomic_xchg_8_impl);
12382     entries.append((address)aarch64_atomic_cmpxchg_1_impl);
12383     entries.append((address)aarch64_atomic_cmpxchg_4_impl);
12384     entries.append((address)aarch64_atomic_cmpxchg_8_impl);
12385     entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
12386     entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
12387     entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
12388     entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
12389     entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
12390     entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
12391     entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);
12392 
12393     assert(entries.length() == entry_count - 1,
12394            "unexpected extra entry count %d", entries.length());
12395 
12396     store_archive_data(stub_id, start, end, &entries);
12397   }
12398 #endif // LINUX
12399 
12400   address generate_cont_thaw(Continuation::thaw_kind kind) {
12401     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
12402     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
12403 
12404     address start = __ pc();
12405 
12406     if (return_barrier) {
12407       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
12408       __ mov(sp, rscratch1);
12409     }
12410     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
12411 
12412     if (return_barrier) {
12413       // preserve possible return value from a method returning to the return barrier
12414       __ fmovd(rscratch1, v0);
12415       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
12416     }
12417 
12418     __ movw(c_rarg1, (return_barrier ? 1 : 0));
12419     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
12420     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
12421 
12422     if (return_barrier) {
12423       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
12424       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
12425       __ fmovd(v0, rscratch1);
12426     }
12427     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
12428 
12429 
12430     Label thaw_success;
12431     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
12432     __ cbnz(rscratch2, thaw_success);
12433     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
12434     __ br(rscratch1);
12435     __ bind(thaw_success);
12436 
12437     // make room for the thawed frames
12438     __ sub(rscratch1, sp, rscratch2);
12439     __ andr(rscratch1, rscratch1, -16); // align
12440     __ mov(sp, rscratch1);
12441 
12442     if (return_barrier) {
12443       // save original return value -- again
12444       __ fmovd(rscratch1, v0);
12445       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
12446     }
12447 
12448     // If we want, we can templatize thaw by kind, and have three different entries
12449     __ movw(c_rarg1, (uint32_t)kind);
12450 
12451     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
12452     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
12453 
12454     if (return_barrier) {
12455       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
12456       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
12457       __ fmovd(v0, rscratch1);
12458     } else {
12459       __ mov(r0, zr); // return 0 (success) from doYield
12460     }
12461 
12462     // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
12463     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
12464     __ mov(rfp, sp);
12465 
12466     if (return_barrier_exception) {
12467       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
12468       __ authenticate_return_address(c_rarg1);
12469       __ verify_oop(r0);
12470       // save return value containing the exception oop in callee-saved R19
12471       __ mov(r19, r0);
12472 
12473       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
12474 
12475       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
12476       // __ reinitialize_ptrue();
12477 
12478       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
12479 
12480       __ mov(r1, r0); // the exception handler
12481       __ mov(r0, r19); // restore return value containing the exception oop
12482       __ verify_oop(r0);
12483 
12484       __ leave();
12485       __ mov(r3, lr);
12486       __ br(r1); // the exception handler
12487     } else {
12488       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
12489       __ leave();
12490       __ ret(lr);
12491     }
12492 
12493     return start;
12494   }
12495 
12496   address generate_cont_thaw() {
12497     if (!Continuations::enabled()) return nullptr;
12498 
12499     StubId stub_id = StubId::stubgen_cont_thaw_id;
12500     int entry_count = StubInfo::entry_count(stub_id);
12501     assert(entry_count == 1, "sanity check");
12502     address start = load_archive_data(stub_id);
12503     if (start != nullptr) {
12504       return start;
12505     }
12506     StubCodeMark mark(this, stub_id);
12507     start = __ pc();
12508     generate_cont_thaw(Continuation::thaw_top);
12509 
12510     // record the stub start and end
12511     store_archive_data(stub_id, start, __ pc());
12512 
12513     return start;
12514   }
12515 
12516   address generate_cont_returnBarrier() {
12517     if (!Continuations::enabled()) return nullptr;
12518 
12519     // TODO: will probably need multiple return barriers depending on return type
12520     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
12521     int entry_count = StubInfo::entry_count(stub_id);
12522     assert(entry_count == 1, "sanity check");
12523     address start = load_archive_data(stub_id);
12524     if (start != nullptr) {
12525       return start;
12526     }
12527     StubCodeMark mark(this, stub_id);
12528     start = __ pc();
12529 
12530     generate_cont_thaw(Continuation::thaw_return_barrier);
12531 
12532     // record the stub start and end
12533     store_archive_data(stub_id, start, __ pc());
12534 
12535     return start;
12536   }
12537 
12538   address generate_cont_returnBarrier_exception() {
12539     if (!Continuations::enabled()) return nullptr;
12540 
12541     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
12542     int entry_count = StubInfo::entry_count(stub_id);
12543     assert(entry_count == 1, "sanity check");
12544     address start = load_archive_data(stub_id);
12545     if (start != nullptr) {
12546       return start;
12547     }
12548     StubCodeMark mark(this, stub_id);
12549     start = __ pc();
12550 
12551     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
12552 
12553     // record the stub start and end
12554     store_archive_data(stub_id, start, __ pc());
12555 
12556     return start;
12557   }
12558 
12559   address generate_cont_preempt_stub() {
12560     if (!Continuations::enabled()) return nullptr;
12561     StubId stub_id = StubId::stubgen_cont_preempt_id;
12562     int entry_count = StubInfo::entry_count(stub_id);
12563     assert(entry_count == 1, "sanity check");
12564     address start = load_archive_data(stub_id);
12565     if (start != nullptr) {
12566       return start;
12567     }
12568     StubCodeMark mark(this, stub_id);
12569     start = __ pc();
12570 
12571     __ reset_last_Java_frame(true);
12572 
12573     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
12574     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
12575     __ mov(sp, rscratch2);
12576 
12577     Label preemption_cancelled;
12578     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
12579     __ cbnz(rscratch1, preemption_cancelled);
12580 
12581     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
12582     SharedRuntime::continuation_enter_cleanup(_masm);
12583     __ leave();
12584     __ ret(lr);
12585 
12586     // We acquired the monitor after freezing the frames so call thaw to continue execution.
12587     __ bind(preemption_cancelled);
12588     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
12589     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
12590     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
12591     __ ldr(rscratch1, Address(rscratch1));
12592     __ br(rscratch1);
12593 
12594     // record the stub start and end
12595     store_archive_data(stub_id, start, __ pc());
12596 
12597     return start;
12598   }
12599 
12600   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
12601   // are represented as long[5], with BITS_PER_LIMB = 26.
12602   // Pack five 26-bit limbs into three 64-bit registers.
12603   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
12604     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
12605     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
12606     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
12607     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
12608 
12609     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
12610     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
12611     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
12612     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
12613 
12614     if (dest2->is_valid()) {
12615       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
12616     } else {
12617 #ifdef ASSERT
12618       Label OK;
12619       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
12620       __ br(__ EQ, OK);
12621       __ stop("high bits of Poly1305 integer should be zero");
12622       __ should_not_reach_here();
12623       __ bind(OK);
12624 #endif
12625     }
12626   }
12627 
12628   // As above, but return only a 128-bit integer, packed into two
12629   // 64-bit registers.
12630   void pack_26(Register dest0, Register dest1, Register src) {
12631     pack_26(dest0, dest1, noreg, src);
12632   }
12633 
12634   // Multiply and multiply-accumulate unsigned 64-bit registers.
  // wide_mul: emits mul and umulh on the same operands, leaving the low
  // half of n * m in prod_lo and the high half in prod_hi.
  void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
    // prod_lo is written before umulh reads n and m again, so prod_lo
    // must not be the same register as n or m.
    __ mul(prod_lo, n, m);
    __ umulh(prod_hi, n, m);
  }
12639   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
12640     wide_mul(rscratch1, rscratch2, n, m);
12641     __ adds(sum_lo, sum_lo, rscratch1);
12642     __ adc(sum_hi, sum_hi, rscratch2);
12643   }
12644 
12645   // Poly1305, RFC 7539
12646 
12647   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
12648   // description of the tricks used to simplify and accelerate this
12649   // computation.
12650 
12651   address generate_poly1305_processBlocks() {
12652     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
12653     int entry_count = StubInfo::entry_count(stub_id);
12654     assert(entry_count == 1, "sanity check");
12655     address start = load_archive_data(stub_id);
12656     if (start != nullptr) {
12657       return start;
12658     }
12659     __ align(CodeEntryAlignment);
12660     StubCodeMark mark(this, stub_id);
12661     start = __ pc();
12662     Label here;
12663     __ enter();
12664     RegSet callee_saved = RegSet::range(r19, r28);
12665     __ push(callee_saved, sp);
12666 
12667     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
12668 
12669     // Arguments
12670     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
12671 
12672     // R_n is the 128-bit randomly-generated key, packed into two
12673     // registers.  The caller passes this key to us as long[5], with
12674     // BITS_PER_LIMB = 26.
12675     const Register R_0 = *++regs, R_1 = *++regs;
12676     pack_26(R_0, R_1, r_start);
12677 
12678     // RR_n is (R_n >> 2) * 5
12679     const Register RR_0 = *++regs, RR_1 = *++regs;
12680     __ lsr(RR_0, R_0, 2);
12681     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
12682     __ lsr(RR_1, R_1, 2);
12683     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
12684 
12685     // U_n is the current checksum
12686     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
12687     pack_26(U_0, U_1, U_2, acc_start);
12688 
12689     static constexpr int BLOCK_LENGTH = 16;
12690     Label DONE, LOOP;
12691 
12692     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
12693     __ br(Assembler::LT, DONE); {
12694       __ bind(LOOP);
12695 
12696       // S_n is to be the sum of U_n and the next block of data
12697       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
12698       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
12699       __ adds(S_0, U_0, S_0);
12700       __ adcs(S_1, U_1, S_1);
12701       __ adc(S_2, U_2, zr);
12702       __ add(S_2, S_2, 1);
12703 
12704       const Register U_0HI = *++regs, U_1HI = *++regs;
12705 
12706       // NB: this logic depends on some of the special properties of
12707       // Poly1305 keys. In particular, because we know that the top
12708       // four bits of R_0 and R_1 are zero, we can add together
12709       // partial products without any risk of needing to propagate a
12710       // carry out.
12711       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
12712       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
12713       __ andr(U_2, R_0, 3);
12714       __ mul(U_2, S_2, U_2);
12715 
12716       // Recycle registers S_0, S_1, S_2
12717       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
12718 
12719       // Partial reduction mod 2**130 - 5
12720       __ adds(U_1, U_0HI, U_1);
12721       __ adc(U_2, U_1HI, U_2);
12722       // Sum now in U_2:U_1:U_0.
12723       // Dead: U_0HI, U_1HI.
12724       regs = (regs.remaining() + U_0HI + U_1HI).begin();
12725 
12726       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
12727 
12728       // First, U_2:U_1:U_0 += (U_2 >> 2)
12729       __ lsr(rscratch1, U_2, 2);
12730       __ andr(U_2, U_2, (u8)3);
12731       __ adds(U_0, U_0, rscratch1);
12732       __ adcs(U_1, U_1, zr);
12733       __ adc(U_2, U_2, zr);
12734       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
12735       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
12736       __ adcs(U_1, U_1, zr);
12737       __ adc(U_2, U_2, zr);
12738 
12739       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
12740       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
12741       __ br(~ Assembler::LT, LOOP);
12742     }
12743 
12744     // Further reduce modulo 2^130 - 5
12745     __ lsr(rscratch1, U_2, 2);
12746     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
12747     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
12748     __ adcs(U_1, U_1, zr);
12749     __ andr(U_2, U_2, (u1)3);
12750     __ adc(U_2, U_2, zr);
12751 
12752     // Unpack the sum into five 26-bit limbs and write to memory.
12753     __ ubfiz(rscratch1, U_0, 0, 26);
12754     __ ubfx(rscratch2, U_0, 26, 26);
12755     __ stp(rscratch1, rscratch2, Address(acc_start));
12756     __ ubfx(rscratch1, U_0, 52, 12);
12757     __ bfi(rscratch1, U_1, 12, 14);
12758     __ ubfx(rscratch2, U_1, 14, 26);
12759     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
12760     __ ubfx(rscratch1, U_1, 40, 24);
12761     __ bfi(rscratch1, U_2, 24, 3);
12762     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
12763 
12764     __ bind(DONE);
12765     __ pop(callee_saved, sp);
12766     __ leave();
12767     __ ret(lr);
12768 
12769     // record the stub start and end
12770     store_archive_data(stub_id, start, __ pc());
12771 
12772     return start;
12773   }
12774 
12775   // exception handler for upcall stubs
12776   address generate_upcall_stub_exception_handler() {
12777     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
12778     int entry_count = StubInfo::entry_count(stub_id);
12779     assert(entry_count == 1, "sanity check");
12780     address start = load_archive_data(stub_id);
12781     if (start != nullptr) {
12782       return start;
12783     }
12784     StubCodeMark mark(this, stub_id);
12785     start = __ pc();
12786 
12787     // Native caller has no idea how to handle exceptions,
12788     // so we just crash here. Up to callee to catch exceptions.
12789     __ verify_oop(r0);
12790     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
12791     __ blr(rscratch1);
12792     __ should_not_reach_here();
12793 
12794     // record the stub start and end
12795     store_archive_data(stub_id, start, __ pc());
12796 
12797     return start;
12798   }
12799 
12800   // load Method* target of MethodHandle
12801   // j_rarg0 = jobject receiver
12802   // rmethod = result
12803   address generate_upcall_stub_load_target() {
12804     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
12805     int entry_count = StubInfo::entry_count(stub_id);
12806     assert(entry_count == 1, "sanity check");
12807     address start = load_archive_data(stub_id);
12808     if (start != nullptr) {
12809       return start;
12810     }
12811     StubCodeMark mark(this, stub_id);
12812     start = __ pc();
12813 
12814     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
12815       // Load target method from receiver
12816     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
12817     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
12818     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
12819     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
12820                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
12821                       noreg, noreg);
12822     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
12823 
12824     __ ret(lr);
12825 
12826     // record the stub start and end
12827     store_archive_data(stub_id, start, __ pc());
12828 
12829     return start;
12830   }
12831 
12832 #undef __
12833 #define __ masm->
12834 
12835   class MontgomeryMultiplyGenerator : public MacroAssembler {
12836 
12837     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
12838       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
12839 
12840     RegSet _toSave;
12841     bool _squaring;
12842 
12843   public:
12844     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
12845       : MacroAssembler(as->code()), _squaring(squaring) {
12846 
12847       // Register allocation
12848 
12849       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
12850       Pa_base = *regs;       // Argument registers
12851       if (squaring)
12852         Pb_base = Pa_base;
12853       else
12854         Pb_base = *++regs;
12855       Pn_base = *++regs;
12856       Rlen= *++regs;
12857       inv = *++regs;
12858       Pm_base = *++regs;
12859 
12860                           // Working registers:
12861       Ra =  *++regs;        // The current digit of a, b, n, and m.
12862       Rb =  *++regs;
12863       Rm =  *++regs;
12864       Rn =  *++regs;
12865 
12866       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
12867       Pb =  *++regs;
12868       Pm =  *++regs;
12869       Pn =  *++regs;
12870 
12871       t0 =  *++regs;        // Three registers which form a
12872       t1 =  *++regs;        // triple-precision accumuator.
12873       t2 =  *++regs;
12874 
12875       Ri =  *++regs;        // Inner and outer loop indexes.
12876       Rj =  *++regs;
12877 
12878       Rhi_ab = *++regs;     // Product registers: low and high parts
12879       Rlo_ab = *++regs;     // of a*b and m*n.
12880       Rhi_mn = *++regs;
12881       Rlo_mn = *++regs;
12882 
12883       // r19 and up are callee-saved.
12884       _toSave = RegSet::range(r19, *regs) + Pm_base;
12885     }
12886 
12887   private:
    // Emits a push of the register set _toSave (computed in the
    // constructor) using sp as the stack pointer.
    void save_regs() {
      push(_toSave, sp);
    }
12891 
    // Emits a pop of the register set _toSave using sp as the stack
    // pointer; the counterpart of save_regs().
    void restore_regs() {
      pop(_toSave, sp);
    }
12895 
12896     template <typename T>
12897     void unroll_2(Register count, T block) {
12898       Label loop, end, odd;
12899       tbnz(count, 0, odd);
12900       cbz(count, end);
12901       align(16);
12902       bind(loop);
12903       (this->*block)();
12904       bind(odd);
12905       (this->*block)();
12906       subs(count, count, 2);
12907       br(Assembler::GT, loop);
12908       bind(end);
12909     }
12910 
12911     template <typename T>
12912     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
12913       Label loop, end, odd;
12914       tbnz(count, 0, odd);
12915       cbz(count, end);
12916       align(16);
12917       bind(loop);
12918       (this->*block)(d, s, tmp);
12919       bind(odd);
12920       (this->*block)(d, s, tmp);
12921       subs(count, count, 2);
12922       br(Assembler::GT, loop);
12923       bind(end);
12924     }
12925 
12926     void pre1(RegisterOrConstant i) {
12927       block_comment("pre1");
12928       // Pa = Pa_base;
12929       // Pb = Pb_base + i;
12930       // Pm = Pm_base;
12931       // Pn = Pn_base + i;
12932       // Ra = *Pa;
12933       // Rb = *Pb;
12934       // Rm = *Pm;
12935       // Rn = *Pn;
12936       ldr(Ra, Address(Pa_base));
12937       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
12938       ldr(Rm, Address(Pm_base));
12939       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
12940       lea(Pa, Address(Pa_base));
12941       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
12942       lea(Pm, Address(Pm_base));
12943       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
12944 
12945       // Zero the m*n result.
12946       mov(Rhi_mn, zr);
12947       mov(Rlo_mn, zr);
12948     }
12949 
12950     // The core multiply-accumulate step of a Montgomery
12951     // multiplication.  The idea is to schedule operations as a
12952     // pipeline so that instructions with long latencies (loads and
12953     // multiplies) have time to complete before their results are
12954     // used.  This most benefits in-order implementations of the
12955     // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      // The a*b product is started here but only accumulated at the very
      // end of this step; the next digits of a and b are loaded in between.
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      // The m*n product computed here is left pending in Rhi_mn:Rlo_mn;
      // it is accumulated by whatever code runs next (another step or one
      // of the post routines).
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }
12976 
    // Finishes one iteration of the first outer loop: accumulates the last
    // a*b product and the pending m*n product, computes and stores the new
    // digit of m, accumulates m*n for that digit and shifts the accumulator
    // down by one word.
    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // (Unlike step(), no further digits of a or b are loaded here.)
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      // Only the high half of Rm * Rn is computed; see below for why the
      // low half is not needed.
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      // Add the high half with that carry while shifting the accumulator
      // down one word: t0 = t1 + Rhi_mn + C; t1 = t2 + C; t2 = 0.
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }
13021 
13022     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
13023       block_comment("pre2");
13024       // Pa = Pa_base + i-len;
13025       // Pb = Pb_base + len;
13026       // Pm = Pm_base + i-len;
13027       // Pn = Pn_base + len;
13028 
13029       if (i.is_register()) {
13030         sub(Rj, i.as_register(), len);
13031       } else {
13032         mov(Rj, i.as_constant());
13033         sub(Rj, Rj, len);
13034       }
13035       // Rj == i-len
13036 
13037       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
13038       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
13039       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
13040       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
13041 
13042       // Ra = *++Pa;
13043       // Rb = *--Pb;
13044       // Rm = *++Pm;
13045       // Rn = *--Pn;
13046       ldr(Ra, pre(Pa, wordSize));
13047       ldr(Rb, pre(Pb, -wordSize));
13048       ldr(Rm, pre(Pm, wordSize));
13049       ldr(Rn, pre(Pn, -wordSize));
13050 
13051       mov(Rhi_mn, zr);
13052       mov(Rlo_mn, zr);
13053     }
13054 
13055     void post2(RegisterOrConstant i, RegisterOrConstant len) {
13056       block_comment("post2");
13057       if (i.is_constant()) {
13058         mov(Rj, i.as_constant()-len.as_constant());
13059       } else {
13060         sub(Rj, i.as_register(), len);
13061       }
13062 
13063       adds(t0, t0, Rlo_mn); // The pending m*n, low part
13064 
13065       // As soon as we know the least significant digit of our result,
13066       // store it.
13067       // Pm_base[i-len] = t0;
13068       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
13069 
13070       // t0 = t1; t1 = t2; t2 = 0;
13071       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
13072       adc(t1, t2, zr);
13073       mov(t2, zr);
13074     }
13075 
13076     // A carry in t0 after Montgomery multiplication means that we
13077     // should subtract multiples of n from our result in m.  We'll
13078     // keep doing that until there is no carry.
13079     void normalize(RegisterOrConstant len) {
13080       block_comment("normalize");
13081       // while (t0)
13082       //   t0 = sub(Pm_base, Pn_base, t0, len);
13083       Label loop, post, again;
13084       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
13085       cbz(t0, post); {
13086         bind(again); {
13087           mov(i, zr);
13088           mov(cnt, len);
13089           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
13090           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
13091           subs(zr, zr, zr); // set carry flag, i.e. no borrow
13092           align(16);
13093           bind(loop); {
13094             sbcs(Rm, Rm, Rn);
13095             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
13096             add(i, i, 1);
13097             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
13098             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
13099             sub(cnt, cnt, 1);
13100           } cbnz(cnt, loop);
13101           sbc(t0, t0, zr);
13102         } cbnz(t0, again);
13103       } bind(post);
13104     }
13105 
13106     // Move memory at s to d, reversing words.
13107     //    Increments d to end of copied memory
13108     //    Destroys tmp1, tmp2
13109     //    Preserves len
13110     //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      // The temporaries must lie below r19, i.e. outside the callee-saved
      // range; generate_multiply() and generate_square() call this before
      // save_regs().
      assert(tmp1->encoding() < r19->encoding(), "register corruption");
      assert(tmp2->encoding() < r19->encoding(), "register corruption");

      // Point s one past the last source word; reverse1 walks it backwards.
      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      // unroll_2 destroys its count, so count in a copy to preserve len.
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      // d has advanced by len words; s = d - len words, i.e. d's old value.
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
13120     // where
    // Moves one word for reverse(): steps s back by a word and loads from
    // it, rotates the loaded value by 32 bits (exchanging its two 32-bit
    // halves), then stores it at d and steps d forward by a word.
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }
13126 
    // A step() in which the a*b product is accumulated twice (MACC2 in the
    // C sketch of montgomery_square).  step() ends by accumulating
    // Rhi_ab:Rlo_ab, and those registers still hold the product when the
    // extra acc below adds it a second time.
    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }
13132 
13133     void last_squaring(RegisterOrConstant i) {
13134       Label dont;
13135       // if ((i & 1) == 0) {
13136       tbnz(i.as_register(), 0, dont); {
13137         // MACC(Ra, Rb, t0, t1, t2);
13138         // Ra = *++Pa;
13139         // Rb = *--Pb;
13140         umulh(Rhi_ab, Ra, Rb);
13141         mul(Rlo_ab, Ra, Rb);
13142         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
13143       } bind(dont);
13144     }
13145 
    // The m*n half of step(), used by generate_square() for iterations
    // that have no matching a*b product: accumulates the pending m*n
    // product, starts the next one and loads the next digits of m and n.
    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      // The new product is left pending in Rhi_mn:Rlo_mn.
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }
13157 
    // Squaring counterpart of post1().  It performs the same sequence
    // except that no a*b product is computed or accumulated here: it
    // accumulates the pending m*n product, computes and stores the new
    // digit of m, accumulates m*n for that digit and shifts the accumulator
    // down by one word.
    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      // Only the high half of Rm * Rn is computed; see below for why the
      // low half is not needed.
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      // Add the high half with that carry while shifting the accumulator
      // down one word: t0 = t1 + Rhi_mn + C; t1 = t2 + C; t2 = 0.
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }
13194 
    // Adds the two-word value Rhi:Rlo into the three-word accumulator
    // t2:t1:t0, propagating carries upwards: Rlo is added to t0, Rhi plus
    // carry to t1, and the final carry to t2.  Sets the condition flags.
    // Note that the parameters t0, t1 and t2 shadow the member registers of
    // the same names.
    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }
13201 
13202   public:
13203     /**
13204      * Fast Montgomery multiplication.  The derivation of the
13205      * algorithm is in A Cryptographic Library for the Motorola
13206      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
13207      *
13208      * Arguments:
13209      *
13210      * Inputs for multiplication:
13211      *   c_rarg0   - int array elements a
13212      *   c_rarg1   - int array elements b
13213      *   c_rarg2   - int array elements n (the modulus)
13214      *   c_rarg3   - int length
13215      *   c_rarg4   - int inv
13216      *   c_rarg5   - int array elements m (the result)
13217      *
13218      * Inputs for squaring:
13219      *   c_rarg0   - int array elements a
13220      *   c_rarg1   - int array elements n (the modulus)
13221      *   c_rarg2   - int length
13222      *   c_rarg3   - int inv
13223      *   c_rarg4   - int array elements m (the result)
13224      *
13225      */
    address generate_multiply() {
      // argh:    taken when the length exceeds the supported maximum.
      // nothing: taken when the length is zero; bound just before the
      //          final ret so that no frame is set up or torn down.
      Label argh, nothing;

      align(CodeEntryAlignment);
      address entry = pc();

      // A zero length means there is nothing to compute.
      cbzw(Rlen, nothing);

      enter();

      // Make room.
      // Lengths above 512 ints are rejected.  Otherwise the stack pointer
      // is lowered by Rlen << log2(4 * sizeof(jint)) bytes, i.e. room for
      // four arrays of Rlen ints, and then rounded down to a multiple of
      // 2 * wordSize.  Ra is left pointing at the start of that scratch
      // area.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        // Each reverse() advances Ra past the copy it has just written and
        // repoints the corresponding *_base register at that copy.  When
        // squaring there is no separate b to copy.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all callee-saved registers in use and also Pm_base (the
      // caller's result pointer) which we'll need at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        subs(zr, Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      // From here on Pm_base refers to the scratch area that follows the
      // reversed copies; the caller's pointer was saved by save_regs().
      mov(Pm_base, Ra);

      // Clear the triple-precision accumulator.
      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        // Initial guard; the loop itself tests at the bottom.
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          // unroll_2 destroys its count, so iterate on a copy of Ri in Rj.
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        // Compare Ri against 2*Rlen (Rlen shifted left by one).
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          // Rj = 2*Rlen - Ri - 1
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      // Subtract n from the result for as long as a carry remains in t0.
      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      // (reversing again, so the word order of the inputs is restored).
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      // handler for error case
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      return entry;
    }
13340     // In C, approximately:
13341 
13342     // void
13343     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
13344     //                     julong Pn_base[], julong Pm_base[],
13345     //                     julong inv, int len) {
13346     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
13347     //   julong *Pa, *Pb, *Pn, *Pm;
13348     //   julong Ra, Rb, Rn, Rm;
13349 
13350     //   int i;
13351 
13352     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
13353 
13354     //   for (i = 0; i < len; i++) {
13355     //     int j;
13356 
13357     //     Pa = Pa_base;
13358     //     Pb = Pb_base + i;
13359     //     Pm = Pm_base;
13360     //     Pn = Pn_base + i;
13361 
13362     //     Ra = *Pa;
13363     //     Rb = *Pb;
13364     //     Rm = *Pm;
13365     //     Rn = *Pn;
13366 
13367     //     int iters = i;
13368     //     for (j = 0; iters--; j++) {
13369     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
13370     //       MACC(Ra, Rb, t0, t1, t2);
13371     //       Ra = *++Pa;
13372     //       Rb = *--Pb;
13373     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13374     //       MACC(Rm, Rn, t0, t1, t2);
13375     //       Rm = *++Pm;
13376     //       Rn = *--Pn;
13377     //     }
13378 
13379     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
13380     //     MACC(Ra, Rb, t0, t1, t2);
13381     //     *Pm = Rm = t0 * inv;
13382     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
13383     //     MACC(Rm, Rn, t0, t1, t2);
13384 
13385     //     assert(t0 == 0, "broken Montgomery multiply");
13386 
13387     //     t0 = t1; t1 = t2; t2 = 0;
13388     //   }
13389 
13390     //   for (i = len; i < 2*len; i++) {
13391     //     int j;
13392 
13393     //     Pa = Pa_base + i-len;
13394     //     Pb = Pb_base + len;
13395     //     Pm = Pm_base + i-len;
13396     //     Pn = Pn_base + len;
13397 
13398     //     Ra = *++Pa;
13399     //     Rb = *--Pb;
13400     //     Rm = *++Pm;
13401     //     Rn = *--Pn;
13402 
13403     //     int iters = len*2-i-1;
13404     //     for (j = i-len+1; iters--; j++) {
13405     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
13406     //       MACC(Ra, Rb, t0, t1, t2);
13407     //       Ra = *++Pa;
13408     //       Rb = *--Pb;
13409     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13410     //       MACC(Rm, Rn, t0, t1, t2);
13411     //       Rm = *++Pm;
13412     //       Rn = *--Pn;
13413     //     }
13414 
13415     //     Pm_base[i-len] = t0;
13416     //     t0 = t1; t1 = t2; t2 = 0;
13417     //   }
13418 
13419     //   while (t0)
13420     //     t0 = sub(Pm_base, Pn_base, t0, len);
13421     // }
13422 
13423     /**
13424      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
13425      * multiplies than Montgomery multiplication so it should be up to
13426      * 25% faster.  However, its loop control is more complex and it
13427      * may actually run slower on some machines.
13428      *
13429      * Arguments:
13430      *
13431      * Inputs:
13432      *   c_rarg0   - int array elements a
13433      *   c_rarg1   - int array elements n (the modulus)
13434      *   c_rarg2   - int length
13435      *   c_rarg3   - int inv
13436      *   c_rarg4   - int array elements m (the result)
13437      *
13438      */
    address generate_square() {
      // Taken when the length exceeds the supported maximum.
      Label argh;

      align(CodeEntryAlignment);
      address entry = pc();

      // Note: unlike generate_multiply(), no early exit for a zero length
      // is emitted here.
      enter();

      // Make room.
      // Lengths above 512 ints are rejected.  Otherwise the stack pointer
      // is lowered by Rlen << log2(4 * sizeof(jint)) bytes and then rounded
      // down to a multiple of 2 * wordSize.  Ra is left pointing at the
      // start of that scratch area.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        // Each reverse() advances Ra past the copy it has just written and
        // repoints the corresponding *_base register at that copy.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all callee-saved registers in use and also Pm_base (the
      // caller's result pointer) which we'll need at the end.
      save_regs();

      // From here on Pm_base refers to the scratch area that follows the
      // reversed copies; the caller's pointer was saved by save_regs().
      mov(Pm_base, Ra);

      // Clear the triple-precision accumulator.
      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        // The loop is guarded here at the top and tested again at the
        // bottom before branching back.
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          // Rj = (Ri + 1) / 2 doubled a*b products, each with an m*n.
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        // The extra, undoubled product, emitted for even i only.
        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          // Rj = Ri / 2 remaining m*n products.
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        // Compare Ri against 2*Rlen (Rlen shifted left by one).
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          // Rj = (2*Rlen - Ri - 1) / 2
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        // The extra, undoubled product, emitted for even i only.
        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          // Rj = (2*Rlen - Ri) / 2
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      // Subtract n from the result for as long as a carry remains in t0.
      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      // (reversing again, so the word order of the inputs is restored).
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      // handler for error case
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      return entry;
    }
13555     // In C, approximately:
13556 
13557     // void
13558     // montgomery_square(julong Pa_base[], julong Pn_base[],
13559     //                   julong Pm_base[], julong inv, int len) {
13560     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
13561     //   julong *Pa, *Pb, *Pn, *Pm;
13562     //   julong Ra, Rb, Rn, Rm;
13563 
13564     //   int i;
13565 
13566     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
13567 
13568     //   for (i = 0; i < len; i++) {
13569     //     int j;
13570 
13571     //     Pa = Pa_base;
13572     //     Pb = Pa_base + i;
13573     //     Pm = Pm_base;
13574     //     Pn = Pn_base + i;
13575 
13576     //     Ra = *Pa;
13577     //     Rb = *Pb;
13578     //     Rm = *Pm;
13579     //     Rn = *Pn;
13580 
13581     //     int iters = (i+1)/2;
13582     //     for (j = 0; iters--; j++) {
13583     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
13584     //       MACC2(Ra, Rb, t0, t1, t2);
13585     //       Ra = *++Pa;
13586     //       Rb = *--Pb;
13587     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13588     //       MACC(Rm, Rn, t0, t1, t2);
13589     //       Rm = *++Pm;
13590     //       Rn = *--Pn;
13591     //     }
13592     //     if ((i & 1) == 0) {
13593     //       assert(Ra == Pa_base[j], "must be");
13594     //       MACC(Ra, Ra, t0, t1, t2);
13595     //     }
13596     //     iters = i/2;
13597     //     assert(iters == i-j, "must be");
13598     //     for (; iters--; j++) {
13599     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13600     //       MACC(Rm, Rn, t0, t1, t2);
13601     //       Rm = *++Pm;
13602     //       Rn = *--Pn;
13603     //     }
13604 
13605     //     *Pm = Rm = t0 * inv;
13606     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
13607     //     MACC(Rm, Rn, t0, t1, t2);
13608 
13609     //     assert(t0 == 0, "broken Montgomery multiply");
13610 
13611     //     t0 = t1; t1 = t2; t2 = 0;
13612     //   }
13613 
13614     //   for (i = len; i < 2*len; i++) {
13615     //     int start = i-len+1;
13616     //     int end = start + (len - start)/2;
13617     //     int j;
13618 
13619     //     Pa = Pa_base + i-len;
13620     //     Pb = Pa_base + len;
13621     //     Pm = Pm_base + i-len;
13622     //     Pn = Pn_base + len;
13623 
13624     //     Ra = *++Pa;
13625     //     Rb = *--Pb;
13626     //     Rm = *++Pm;
13627     //     Rn = *--Pn;
13628 
13629     //     int iters = (2*len-i-1)/2;
13630     //     assert(iters == end-start, "must be");
13631     //     for (j = start; iters--; j++) {
13632     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
13633     //       MACC2(Ra, Rb, t0, t1, t2);
13634     //       Ra = *++Pa;
13635     //       Rb = *--Pb;
13636     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13637     //       MACC(Rm, Rn, t0, t1, t2);
13638     //       Rm = *++Pm;
13639     //       Rn = *--Pn;
13640     //     }
13641     //     if ((i & 1) == 0) {
13642     //       assert(Ra == Pa_base[j], "must be");
13643     //       MACC(Ra, Ra, t0, t1, t2);
13644     //     }
13645     //     iters =  (2*len-i)/2;
13646     //     assert(iters == len-j, "must be");
13647     //     for (; iters--; j++) {
13648     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13649     //       MACC(Rm, Rn, t0, t1, t2);
13650     //       Rm = *++Pm;
13651     //       Rn = *--Pn;
13652     //     }
13653     //     Pm_base[i-len] = t0;
13654     //     t0 = t1; t1 = t2; t2 = 0;
13655     //   }
13656 
13657     //   while (t0)
13658     //     t0 = sub(Pm_base, Pn_base, t0, len);
13659     // }
13660   };
13661 
13662   // Initialization
  // Intentionally empty: this platform generates no stubs at this stage.
  void generate_preuniverse_stubs() {
    // preuniverse stubs are not needed for aarch64
  }
13666 
  // Generates the initial group of stubs and records their entry points
  // in StubRoutines.  The exception-forwarding, call and catch-exception
  // stubs are always generated; the remaining ones depend on the
  // corresponding flag or on the intrinsic being available.
  void generate_initial_stubs() {
    // Generate initial stubs and initializes the entry points

    // entry points that exist in all platforms Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    // The table is created only if it does not exist yet.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    if (UseCRC32Intrinsics) {
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // Sine and cosine share one generator, selected by isCos.
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    // The two float16 conversion stubs are generated only as a pair, when
    // both intrinsics are available.
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }
13711 
  // Generates the four continuation stubs (thaw, return barrier, return
  // barrier for exceptions, preempt) and records their entry points in
  // StubRoutines.  All four are generated unconditionally.
  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw          = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
  }
13719 
  // Generates the final group of stubs and records their entry points,
  // then marks the aarch64 stub routines as completed.
  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

    // The atomic entry points are generated only on Linux builds whose
    // compiler does not define __ARM_FEATURE_ATOMICS.
#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    // The slow-path stub is always generated when the secondary supers
    // table is in use; the lookup stub itself only when the test is not
    // inlined.
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (! InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
      StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
    }

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }
13757 
13758   void generate_compiler_stubs() {
13759 #ifdef COMPILER2
13760 
13761     if (UseSVE == 0) {
13762       generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
13763     }
13764 
13765     // array equals stub for large arrays.
13766     if (!UseSimpleArrayEquals) {
13767       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
13768     }
13769 
13770     // arrays_hascode stub for large arrays.
13771     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
13772     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
13773     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
13774     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
13775     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
13776 
13777     // byte_array_inflate stub for large arrays.
13778     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
13779 
13780     // countPositives stub for large arrays.
13781     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
13782 
13783     generate_compare_long_strings();
13784 
13785     generate_string_indexof_stubs();
13786 
13787     if (UseMultiplyToLenIntrinsic) {
13788       StubRoutines::_multiplyToLen = generate_multiplyToLen();
13789     }
13790 
13791     if (UseSquareToLenIntrinsic) {
13792       StubRoutines::_squareToLen = generate_squareToLen();
13793     }
13794 
13795     if (UseMulAddIntrinsic) {
13796       StubRoutines::_mulAdd = generate_mulAdd();
13797     }
13798 
13799     if (UseSIMDForBigIntegerShiftIntrinsics) {
13800       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
13801       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
13802     }
13803 
13804     if (UseMontgomeryMultiplyIntrinsic) {
13805       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
13806       address start = load_archive_data(stub_id);
13807       if (start == nullptr) {
13808         // we have to generate it
13809         StubCodeMark mark(this, stub_id);
13810         MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
13811         start = g.generate_multiply();
13812         // record the stub start and end
13813         store_archive_data(stub_id, start, _masm->pc());
13814       }
13815       StubRoutines::_montgomeryMultiply = start;
13816     }
13817 
13818     if (UseMontgomerySquareIntrinsic) {
13819       StubId stub_id = StubId::stubgen_montgomerySquare_id;
13820       address start = load_archive_data(stub_id);
13821       if (start == nullptr) {
13822         // we have to generate it
13823         StubCodeMark mark(this, stub_id);
13824         MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
13825         // We use generate_multiply() rather than generate_square()
13826         // because it's faster for the sizes of modulus we care about.
13827         start = g.generate_multiply();
13828         // record the stub start and end
13829         store_archive_data(stub_id, start, _masm->pc());
13830       }
13831       StubRoutines::_montgomerySquare = start;
13832     }
13833 
13834     if (UseChaCha20Intrinsics) {
13835       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
13836     }
13837 
13838     if (UseIntPolyIntrinsics) {
13839       StubRoutines::_intpoly_montgomeryMult_P256 = generate_intpoly_montgomeryMult_P256();
13840       StubRoutines::_intpoly_assign = generate_intpoly_assign();
13841     }
13842 
13843     if (UseKyberIntrinsics) {
13844       StubRoutines::_kyberNtt = generate_kyberNtt();
13845       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
13846       StubRoutines::_kyberNttMult = generate_kyberNttMult();
13847       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
13848       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
13849       StubRoutines::_kyber12To16 = generate_kyber12To16();
13850       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
13851     }
13852 
13853     if (UseDilithiumIntrinsics) {
13854       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
13855       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
13856       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
13857       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
13858       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
13859     }
13860 
13861     if (UseBASE64Intrinsics) {
13862         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
13863         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
13864     }
13865 
13866     // data cache line writeback
13867     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
13868     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
13869 
13870     if (UseAESIntrinsics) {
13871       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
13872       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
13873       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
13874       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
13875       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
13876     }
13877     if (UseGHASHIntrinsics) {
13878       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
13879       StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
13880       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
13881     }
13882     if (UseAESIntrinsics && UseGHASHIntrinsics) {
13883       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
13884     }
13885 
13886     if (UseMD5Intrinsics) {
13887       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
13888       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
13889     }
13890     if (UseSHA1Intrinsics) {
13891       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
13892       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
13893     }
13894     if (UseSHA256Intrinsics) {
13895       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
13896       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
13897     }
13898     if (UseSHA512Intrinsics) {
13899       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
13900       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
13901     }
13902     if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {
13903       StubRoutines::_double_keccak         = generate_double_keccak();
13904       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
13905       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
13906     } else if (UseSHA3Intrinsics) {
13907       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
13908       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
13909     }
13910 
13911     if (UsePoly1305Intrinsics) {
13912       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
13913     }
13914 
13915     // The difference between AArch64 vs. x86_64 intrinsics implementation
13916     // include the lack of square() intrinsics; usage caused a 3.3% performance
13917     // degradation due to the efficiencies of the symmetric squaring shape in
13918     // Java vs. the inefficiencies of the leaf calls and the additional cycles
13919     // required for 64 bit multiplication in AArch64.
13920     if (UseIntPoly25519Intrinsics) {
13921       StubRoutines::_intpoly_mult_25519 = generate_intpoly_mult_25519();
13922     }
13923 
13924     // generate Adler32 intrinsics code
13925     if (UseAdler32Intrinsics) {
13926       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
13927     }
13928 
13929 #endif // COMPILER2
13930   }
13931 
13932  public:
13933   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
13934     switch(blob_id) {
13935     case BlobId::stubgen_preuniverse_id:
13936       generate_preuniverse_stubs();
13937       break;
13938     case BlobId::stubgen_initial_id:
13939       generate_initial_stubs();
13940       break;
13941      case BlobId::stubgen_continuation_id:
13942       generate_continuation_stubs();
13943       break;
13944     case BlobId::stubgen_compiler_id:
13945       generate_compiler_stubs();
13946       break;
13947     case BlobId::stubgen_final_id:
13948       generate_final_stubs();
13949       break;
13950     default:
13951       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
13952       break;
13953     };
13954   }
13955 
13956 #if INCLUDE_CDS
13957   static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
13958     // external data defined in this file
13959 #define ADD(addr) external_addresses.append((address)(addr));
13960     ADD(_sha256_round_consts);
13961     ADD(_sha512_round_consts);
13962     ADD(_sha3_round_consts);
13963     ADD(_double_keccak_round_consts);
13964     ADD(_modulus_P256);
13965     ADD(_encodeBlock_toBase64);
13966     ADD(_encodeBlock_toBase64URL);
13967     ADD(_decodeBlock_fromBase64ForNoSIMD);
13968     ADD(_decodeBlock_fromBase64URLForNoSIMD);
13969     ADD(_decodeBlock_fromBase64ForSIMD);
13970     ADD(_decodeBlock_fromBase64URLForSIMD);
13971 #undef ADD
13972   }
13973 #endif // INCLUDE_CDS
13974 }; // end class declaration
13975 
13976 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
13977   StubGenerator g(code, blob_id, stub_data);
13978 }
13979 
13980 #if INCLUDE_CDS
13981 void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
13982   StubGenerator::init_AOTAddressTable(addresses);
13983 }
13984 #endif // INCLUDE_CDS
13985 
13986 #if defined (LINUX)
13987 
13988 // Define pointers to atomic stubs and initialize them to point to the
13989 // code in atomic_aarch64.S.
13990 
13991 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
13992   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
13993     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
13994   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
13995     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
13996 
13997 DEFAULT_ATOMIC_OP(fetch_add, 4, )
13998 DEFAULT_ATOMIC_OP(fetch_add, 8, )
13999 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
14000 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
14001 DEFAULT_ATOMIC_OP(xchg, 4, )
14002 DEFAULT_ATOMIC_OP(xchg, 8, )
14003 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
14004 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
14005 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
14006 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
14007 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
14008 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
14009 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
14010 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
14011 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
14012 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
14013 
14014 #undef DEFAULT_ATOMIC_OP
14015 
14016 #endif // LINUX