1 /*
    2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
      // "__ foo(...)" is shorthand for "_masm->foo(...)": every statement written
      // with the "__" prefix is a call on the current assembler object.
   72 #define __ _masm->
   73 
      // BLOCK_COMMENT(str) passes str to the assembler's block_comment(); in
      // PRODUCT builds it expands to nothing.
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
      // BIND(label) is meant to be written after the "__" prefix ("__ BIND(L);").
      // It binds the label and then issues a BLOCK_COMMENT with the label's name
      // followed by ":".  The expansion is two statements, so it must not be used
      // as the sole body of an unbraced if/else/loop.
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Constant data definitions
   83 
   84 static const uint32_t _sha256_round_consts[64] = {
        // The 64 32-bit SHA-256 round constants, listed in round order
        // (four per row).
   85   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
   86   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
   87   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
   88   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
   89   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
   90   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
   91   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
   92   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
   93   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
   94   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
   95   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
   96   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
   97   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
   98   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
   99   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  100   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  101 };
  102 
  103 static const uint64_t _sha512_round_consts[80] = {
        // The 80 64-bit SHA-512 round constants, listed in round order
        // (three per row, two in the last row).
  104   0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
  105   0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
  106   0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
  107   0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
  108   0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
  109   0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
  110   0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
  111   0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
  112   0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
  113   0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
  114   0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
  115   0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
  116   0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
  117   0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
  118   0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
  119   0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
  120   0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
  121   0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
  122   0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
  123   0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
  124   0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
  125   0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
  126   0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
  127   0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
  128   0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
  129   0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
  130   0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
  131 };
  132 
  133 static const uint64_t _sha3_round_consts[24] = {
        // The 24 64-bit Keccak round constants used by SHA-3, one per round,
        // listed in round order.
  134   0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  135   0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  136   0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  137   0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  138   0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  139   0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  140   0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  141   0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
  142 };
  143 
  144 static const uint64_t _double_keccak_round_consts[24] = {
        // 24 64-bit Keccak round constants.  The values are element-for-element
        // identical to _sha3_round_consts; this is a separate table with its own
        // address.
        // NOTE(review): presumably consumed by the double-Keccak stub -- the user
        // is not visible in this part of the file.
  145   0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  146   0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  147   0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  148   0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  149   0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  150   0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  151   0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  152   0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
  153 };
  154 
  155 // Limbs of the P256 modulus.  The 3rd limb is omitted since it is 0, so
      // only four limbs are listed.
      // NOTE(review): the array is declared with 5 elements but has only 4
      // initializers, so the last element is implicitly zero -- confirm whether
      // the extra slot is intentional.
  156 static const int64_t _modulus_P256[5] = {
  157   0x000fffffffffffffL, 0x00000fffffffffffL,
  158   0x0000001000000000L, 0x0000ffffffff0000L
  159 };
  160 
  161 static const char _encodeBlock_toBase64[64] = {
        // Base64 encoding alphabet: index is the 6-bit value (0..63), entry is
        // the output character.  Values 62 and 63 map to '+' and '/'.
  162   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  163   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  164   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  165   'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  166   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
  167 };
  168 
  169 static const char _encodeBlock_toBase64URL[64] = {
        // URL-safe Base64 encoding alphabet: index is the 6-bit value (0..63),
        // entry is the output character.  Differs from _encodeBlock_toBase64
        // only in the last two entries: values 62 and 63 map to '-' and '_'.
  170   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  171   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  172   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  173   'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  174   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
  175 };
  176 
  177 // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
  178 // java.util.Base64, except that the trailing character '=' is also treated
  179 // as an illegal value in this intrinsic. That is,
      // java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
      //
      // The table is indexed by the input byte (0..255).  An entry is either the
      // 6-bit decoded value (0..63) or 255 for a byte that is not in the
      // alphabet.  In this table '+' decodes to 62 and '/' to 63.
  180 static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
  181   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  182   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  183   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  184   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  185   255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  186   15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
  187   255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  188   41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  189   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  190   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  191   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  192   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  193   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  194   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  195   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  196   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  197 };
  198 
  199 static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
        // URL-safe variant of the non-SIMD decode table: indexed by the input
        // byte (0..255), each entry is the 6-bit decoded value or 255 for a byte
        // outside the alphabet.  Here '-' decodes to 62 and '_' to 63, while '+',
        // '/' and '=' are all 255.
  200   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  201   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  202   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  203   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  204   255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  205   15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
  206   255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  207   41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  208   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  209   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  210   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  211   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  212   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  213   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  214   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  215   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  216 };
  217 
  218 // A legal value of base64 code is in range [0, 127].  We need two lookups
  219 // with tbl/tbx and combine them to get the decoded data. The 1st table vector
  220 // lookup uses tbl, so out of range indices are set to 0 in the destination.
  221 // The 2nd table vector lookup uses tbx, so out of range indices leave the
  222 // destination unchanged. Input [64..126] is mapped to index [65, 127] in the
      // second lookup.
  223 // The value of index 64 is set to 0, so that we know that we already got the
  224 // decoded data with the 1st lookup.
      //
      // Layout of the 128 entries: entries 0..63 hold the decode values for input
      // bytes 0..63; entry 64 is the 0 placeholder described above; entries
      // 65..127 hold the decode values for input bytes 64..126.  255 marks an
      // illegal input byte.
  225 static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
  226   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  227   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  228   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  229   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  230   0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  231   14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  232   255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  233   40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
  234 };
  235 
  236 static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
        // URL-safe variant of the 128-entry SIMD decode table: entries 0..63 are
        // for input bytes 0..63, entry 64 is a 0 placeholder, and entries 65..127
        // are for input bytes 64..126.  Here '-' (entry 45) decodes to 62 and '_'
        // (input 95, entry 96) decodes to 63; 255 marks an illegal input byte.
  237   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  238   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  239   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  240   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  241   0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  242   14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  243   63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  244   40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
  245 };
  246 
  247 
  248 // Stub Code definitions
  249 
  250 class StubGenerator: public StubCodeGenerator {
  251  private:
  252 
  253 #ifdef PRODUCT
        // In PRODUCT builds inc_counter_np(counter) is a no-op expression.
  254 #define inc_counter_np(counter) ((void)0)
  255 #else
        // Emits an incrementw on the uint located at &counter, addressed as an
        // ExternalAddress.
  256   void inc_counter_np_(uint& counter) {
  257     __ incrementw(ExternalAddress((address)&counter));
  258   }
        // Non-PRODUCT form: issues a block comment naming the counter, then emits
        // the increment.  The expansion is two statements and already ends in
        // ';', so it must not be used as the sole body of an unbraced
        // if/else/loop.
  259 #define inc_counter_np(counter) \
  260   BLOCK_COMMENT("inc_counter " #counter); \
  261   inc_counter_np_(counter);
  262 #endif
  263 
  264   // Call stubs are used to call Java from C
  265   //
  266   // Arguments:
  267   //    c_rarg0:   call wrapper address                   address
  268   //    c_rarg1:   result                                 address
  269   //    c_rarg2:   result type                            BasicType
  270   //    c_rarg3:   method                                 Method*
  271   //    c_rarg4:   (interpreter) entry point              address
  272   //    c_rarg5:   parameters                             intptr_t*
  273   //    c_rarg6:   parameter size (in words)              int
  274   //    c_rarg7:   thread                                 Thread*
  275   //
  276   // There is no return from the stub itself as any Java result
  277   // is written to result
  278   //
  279   // we save r30 (lr) as the return PC at the base of the frame and
  280   // link r29 (fp) below it as the frame pointer installing sp (r31)
  281   // into fp.
  282   //
  283   // we save r0-r7, which accounts for all the c arguments.
  284   //
  285   // TODO: strictly do we need to save them all? they are treated as
  286   // volatile by C so could we omit saving the ones we are going to
  287   // place in global registers (thread? method?) or those we only use
  288   // during setup of the Java call?
  289   //
  290   // we don't need to save r8 which C uses as an indirect result location
  291   // return register.
  292   //
  293   // we don't need to save r9-r15 which both C and Java treat as
  294   // volatile
  295   //
  296   // we don't need to save r16-18 because Java does not use them
  297   //
  298   // we save r19-r28 which Java uses as scratch registers and C
  299   // expects to be callee-save
  300   //
  301   // we save the bottom 64 bits of each value stored in v8-v15; it is
  302   // the responsibility of the caller to preserve larger values.
  303   //
  304   // so the stub frame looks like this when we enter Java code
  305   //
  306   //     [ return_from_Java     ] <--- sp
  307   //     [ argument word n      ]
  308   //      ...
  309   // -29 [ argument word 1      ]
  310   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
        // -27 [ unused               ]
  311   // -26 [ saved v15            ]
  312   // -25 [ saved v14            ]
  313   // -24 [ saved v13            ]
  314   // -23 [ saved v12            ]
  315   // -22 [ saved v11            ]
  316   // -21 [ saved v10            ]
  317   // -20 [ saved v9             ]
  318   // -19 [ saved v8             ]
  319   // -18 [ saved r28            ]
  320   // -17 [ saved r27            ]
  321   // -16 [ saved r26            ]
  322   // -15 [ saved r25            ]
  323   // -14 [ saved r24            ]
  324   // -13 [ saved r23            ]
  325   // -12 [ saved r22            ]
  326   // -11 [ saved r21            ]
  327   // -10 [ saved r20            ]
  328   //  -9 [ saved r19            ]
  329   //  -8 [ call wrapper    (r0) ]
  330   //  -7 [ result          (r1) ]
  331   //  -6 [ result type     (r2) ]
  332   //  -5 [ method          (r3) ]
  333   //  -4 [ entry point     (r4) ]
  334   //  -3 [ parameters      (r5) ]
  335   //  -2 [ parameter size  (r6) ]
  336   //  -1 [ thread (r7)          ]
  337   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  338   //   1 [ saved lr       (r30) ]
  339 
  340   // Call stub stack layout word offsets from fp
        //
        // The values are word offsets; generate_call_stub() multiplies each by
        // wordSize to form an rfp-relative Address.  Registers that are saved in
        // pairs (stp/stpd) only have a named offset for the lower-addressed slot
        // of the pair, e.g. r20_off (-10) is where r20 is stored and r19 follows
        // it at -9.
  341   enum call_stub_layout {
          // Lowest word of the save area; sp is set here after the frame is built.
  342     sp_after_call_off  = -28,
  343 
          // The saved FPCR occupies the same word as sp_after_call_off.
  344     fpcr_off           = sp_after_call_off,
          // Pairs v15/v14, v13/v12, v11/v10, v9/v8.
  345     d15_off            = -26,
  346     d13_off            = -24,
  347     d11_off            = -22,
  348     d9_off             = -20,
  349 
          // Pairs r28/r27, r26/r25, r24/r23, r22/r21, r20/r19.
  350     r28_off            = -18,
  351     r26_off            = -16,
  352     r24_off            = -14,
  353     r22_off            = -12,
  354     r20_off            = -10,
          // Save slots for the incoming C arguments.  There is no separate name
          // for -3 (parameters); it is written as the second register of the stp
          // at entry_point_off.
  355     call_wrapper_off   =  -8,
  356     result_off         =  -7,
  357     result_type_off    =  -6,
  358     method_off         =  -5,
  359     entry_point_off    =  -4,
  360     parameter_size_off =  -2,
  361     thread_off         =  -1,
          // Saved fp and return address of the stub's own frame.
  362     fp_f               =   0,
  363     retaddr_off        =   1,
  364   };
  365 
  366   address generate_call_stub(address& return_address) {
          // Emits the call stub and returns its entry address.
          //
          // return_address is an out-parameter.  It receives the pc of the
          // instruction that follows the blr to the Java entry point; that pc is
          // also recorded as the stub's single extra entry.  If
          // load_archive_data() supplies the stub, nothing is emitted and both the
          // entry and return_address come from the archived data.
  367     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  368            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  369            "adjust this code");
  370 
  371     StubId stub_id = StubId::stubgen_call_stub_id;
  372     GrowableArray<address> entries;
  373     int entry_count = StubInfo::entry_count(stub_id);
  374     assert(entry_count == 2, "sanity check");
  375     address start = load_archive_data(stub_id, &entries);
  376     if (start != nullptr) {
  377       assert(entries.length() == 1, "expected 1 extra entry");
  378       return_address = entries.at(0);
  379       return start;
  380     }
  381     StubCodeMark mark(this, stub_id);
  382     start = __ pc();
  383 
          // rfp-relative addresses of the save slots, derived from the
          // call_stub_layout word offsets.
          // NOTE(review): sp_after_call is not referenced again in this function.
  384     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  385 
  386     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  387     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  388     const Address result        (rfp, result_off         * wordSize);
  389     const Address result_type   (rfp, result_type_off    * wordSize);
  390     const Address method        (rfp, method_off         * wordSize);
  391     const Address entry_point   (rfp, entry_point_off    * wordSize);
  392     const Address parameter_size(rfp, parameter_size_off * wordSize);
  393 
  394     const Address thread        (rfp, thread_off         * wordSize);
  395 
  396     const Address d15_save      (rfp, d15_off * wordSize);
  397     const Address d13_save      (rfp, d13_off * wordSize);
  398     const Address d11_save      (rfp, d11_off * wordSize);
  399     const Address d9_save       (rfp, d9_off * wordSize);
  400 
  401     const Address r28_save      (rfp, r28_off * wordSize);
  402     const Address r26_save      (rfp, r26_off * wordSize);
  403     const Address r24_save      (rfp, r24_off * wordSize);
  404     const Address r22_save      (rfp, r22_off * wordSize);
  405     const Address r20_save      (rfp, r20_off * wordSize);
  406 
  407     // stub code
  408 
          // NOTE(review): aarch64_entry is not referenced again in this function.
  409     address aarch64_entry = __ pc();
  410 
  411     // set up frame and move sp to end of save area
  412     __ enter();
  413     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  414 
  415     // save register parameters and Java scratch/global registers
  416     // n.b. we save thread even though it gets installed in
  417     // rthread because we want to sanity check rthread later
  418     __ str(c_rarg7,  thread);
  419     __ strw(c_rarg6, parameter_size);
  420     __ stp(c_rarg4, c_rarg5,  entry_point);
  421     __ stp(c_rarg2, c_rarg3,  result_type);
  422     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  423 
          // each stp/stpd stores two registers starting at the named slot
  424     __ stp(r20, r19,   r20_save);
  425     __ stp(r22, r21,   r22_save);
  426     __ stp(r24, r23,   r24_save);
  427     __ stp(r26, r25,   r26_save);
  428     __ stp(r28, r27,   r28_save);
  429 
  430     __ stpd(v9,  v8,   d9_save);
  431     __ stpd(v11, v10,  d11_save);
  432     __ stpd(v13, v12,  d13_save);
  433     __ stpd(v15, v14,  d15_save);
  434 
          // save the caller's FPCR before modifying it; it is restored on exit
  435     __ get_fpcr(rscratch1);
  436     __ str(rscratch1, fpcr_save);
  437     // Set FPCR to the state we need. We do want Round to Nearest. We
  438     // don't want non-IEEE rounding modes or floating-point traps.
  439     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  440     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  441     __ set_fpcr(rscratch1);
  442 
  443     // install Java thread in global register now we have saved
  444     // whatever value it held
  445     __ mov(rthread, c_rarg7);
  446     // And method
  447     __ mov(rmethod, c_rarg3);
  448 
  449     // set up the heapbase register
  450     __ reinit_heapbase();
  451 
  452 #ifdef ASSERT
  453     // make sure we have no pending exceptions
  454     {
  455       Label L;
  456       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  457       __ cmp(rscratch1, (u1)NULL_WORD);
  458       __ br(Assembler::EQ, L);
  459       __ stop("StubRoutines::call_stub: entered with pending exception");
  460       __ BIND(L);
  461     }
  462 #endif
  463     // pass parameters if any
          // esp takes the current sp; sp itself is then lowered by c_rarg6 words
          // and rounded down to a multiple of 2 * wordSize.
  464     __ mov(esp, sp);
  465     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  466     __ andr(sp, rscratch1, -2 * wordSize);
  467 
  468     BLOCK_COMMENT("pass parameters if any");
  469     Label parameters_done;
  470     // parameter count is still in c_rarg6
  471     // and parameter pointer identifying param 1 is in c_rarg5
  472     __ cbzw(c_rarg6, parameters_done);
  473 
          // copy loop: load one word from the parameter array (post-incrementing
          // c_rarg5), decrement the count, push the word, and repeat while the
          // decremented count is still greater than zero
  474     address loop = __ pc();
  475     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  476     __ subsw(c_rarg6, c_rarg6, 1);
  477     __ push(rscratch1);
  478     __ br(Assembler::GT, loop);
  479 
  480     __ BIND(parameters_done);
  481 
  482     // call Java entry -- passing Method*, and current sp
  483     //      rmethod: Method*
  484     //      r19_sender_sp: sender sp
  485     BLOCK_COMMENT("call Java function");
  486     __ mov(r19_sender_sp, sp);
  487     __ blr(c_rarg4);
  488 
  489     // we do this here because the notify will already have been done
  490     // if we get to the next instruction via an exception
  491     //
  492     // n.b. adding this instruction here affects the calculation of
  493     // whether or not a routine returns to the call stub (used when
  494     // doing stack walks) since the normal test is to check the return
  495     // pc against the address saved below. so we may need to allow for
  496     // this extra instruction in the check.
  497 
  498     // save current address for use by exception handling code
  499 
  500     return_address = __ pc();
  501     entries.append(return_address);
  502 
  503     // store result depending on type (everything that is not
  504     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  505     // n.b. this assumes Java returns an integral result in r0
  506     // and a floating result in j_farg0
  507     // All of j_rargN may be used to return inline type fields so be careful
  508     // not to clobber those.
  509     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
  510     // assignment of Rresult below.
  511     Register Rresult = r14, Rresult_type = r15;
  512     __ ldr(Rresult, result);
  513     Label is_long, is_float, is_double, check_prim, exit;
  514     __ ldr(Rresult_type, result_type);
  515     __ cmp(Rresult_type, (u1)T_OBJECT);
  516     __ br(Assembler::EQ, check_prim);
  517     __ cmp(Rresult_type, (u1)T_LONG);
  518     __ br(Assembler::EQ, is_long);
  519     __ cmp(Rresult_type, (u1)T_FLOAT);
  520     __ br(Assembler::EQ, is_float);
  521     __ cmp(Rresult_type, (u1)T_DOUBLE);
  522     __ br(Assembler::EQ, is_double);
  523 
  524     // handle T_INT case
  525     __ strw(r0, Address(Rresult));
  526 
          // common exit path; the out-of-line result handlers below branch back here
  527     __ BIND(exit);
  528 
  529     // pop parameters
  530     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  531 
  532 #ifdef ASSERT
  533     // verify that threads correspond
  534     {
  535       Label L, S;
  536       __ ldr(rscratch1, thread);
  537       __ cmp(rthread, rscratch1);
  538       __ br(Assembler::NE, S);
  539       __ get_thread(rscratch1);
  540       __ cmp(rthread, rscratch1);
  541       __ br(Assembler::EQ, L);
  542       __ BIND(S);
  543       __ stop("StubRoutines::call_stub: threads must correspond");
  544       __ BIND(L);
  545     }
  546 #endif
  547 
  548     __ pop_cont_fastpath(rthread);
  549 
  550     // restore callee-save registers
  551     __ ldpd(v15, v14,  d15_save);
  552     __ ldpd(v13, v12,  d13_save);
  553     __ ldpd(v11, v10,  d11_save);
  554     __ ldpd(v9,  v8,   d9_save);
  555 
  556     __ ldp(r28, r27,   r28_save);
  557     __ ldp(r26, r25,   r26_save);
  558     __ ldp(r24, r23,   r24_save);
  559     __ ldp(r22, r21,   r22_save);
  560     __ ldp(r20, r19,   r20_save);
  561 
  562     // restore fpcr
  563     __ ldr(rscratch1,  fpcr_save);
  564     __ set_fpcr(rscratch1);
  565 
          // reload the C argument registers from their save slots
  566     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  567     __ ldrw(c_rarg2, result_type);
  568     __ ldr(c_rarg3,  method);
  569     __ ldp(c_rarg4, c_rarg5,  entry_point);
  570     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  571 
  572     // leave frame and return to caller
  573     __ leave();
  574     __ ret(lr);
  575 
  576     // handle return types different from T_INT
          // T_OBJECT results arrive at check_prim.  When InlineTypeReturnedAsFields
          // is false nothing is emitted for it and control falls through to
          // is_long.  Otherwise, if bit 0 of r0 is clear the value is stored via
          // is_long; if it is set, r0 with bit 0 cleared is used as the base from
          // which the pack handler address is loaded and called before exiting.
  577     __ BIND(check_prim);
  578     if (InlineTypeReturnedAsFields) {
  579       // Check for scalarized return value
  580       __ tbz(r0, 0, is_long);
  581       // Load pack handler address
  582       __ andr(rscratch1, r0, -2);
  583       __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
  584       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
  585       __ blr(rscratch1);
  586       __ b(exit);
  587     }
  588 
  589     __ BIND(is_long);
  590     __ str(r0, Address(Rresult, 0));
  591     __ br(Assembler::AL, exit);
  592 
  593     __ BIND(is_float);
  594     __ strs(j_farg0, Address(Rresult, 0));
  595     __ br(Assembler::AL, exit);
  596 
  597     __ BIND(is_double);
  598     __ strd(j_farg0, Address(Rresult, 0));
  599     __ br(Assembler::AL, exit);
  600 
  601     // record the stub entry and end plus the auxiliary entry
  602     store_archive_data(stub_id, start, __ pc(), &entries);
  603 
  604     return start;
  605   }
  606 
  607   // Return point for a Java call if there's an exception thrown in
  608   // Java code.  The exception is caught and transformed into a
  609   // pending exception stored in JavaThread that can be tested from
  610   // within the VM.
  611   //
  612   // Note: Usually the parameters are removed by the callee. In case
  613   // of an exception crossing an activation frame boundary, that is
  614   // not the case if the callee is compiled code => need to set up the
  615   // stack pointer.
  616   //
  617   // r0: exception oop
  618 
  619   address generate_catch_exception() {
          // Emits the catch_exception stub and returns its entry address, or
          // returns the address supplied by load_archive_data() without emitting
          // anything.  The emitted code stores r0 as the thread's pending
          // exception, records this file's name and a source line number in the
          // thread, and then branches to the call stub's return address.
  620     StubId stub_id = StubId::stubgen_catch_exception_id;
  621     int entry_count = StubInfo::entry_count(stub_id);
  622     assert(entry_count == 1, "sanity check");
  623     address start = load_archive_data(stub_id);
  624     if (start != nullptr) {
  625       return start;
  626     }
  627     StubCodeMark mark(this, stub_id);
  628     start = __ pc();
  629 
  630     // same as in generate_call_stub():
          // NOTE(review): sp_after_call is not referenced again in this function.
  631     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  632     const Address thread        (rfp, thread_off         * wordSize);
  633 
  634 #ifdef ASSERT
  635     // verify that threads correspond
  636     {
  637       Label L, S;
  638       __ ldr(rscratch1, thread);
  639       __ cmp(rthread, rscratch1);
  640       __ br(Assembler::NE, S);
  641       __ get_thread(rscratch1);
  642       __ cmp(rthread, rscratch1);
  643       __ br(Assembler::EQ, L);
  644       __ bind(S);
  645       __ stop("StubRoutines::catch_exception: threads must correspond");
  646       __ bind(L);
  647     }
  648 #endif
  649 
  650     // set pending exception
  651     __ verify_oop(r0);
  652 
  653     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  654     // special case -- add file name string to AOT address table
  655     address file = (address)AOTCodeCache::add_C_string(__FILE__);
  656     __ lea(rscratch1, ExternalAddress(file));
  657     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
          // __LINE__ expands to the line number of the statement below, so the
          // recorded value follows the layout of this source file
  658     __ movw(rscratch1, (int)__LINE__);
  659     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  660 
  661     // complete return to VM
          // the branch target is read at generation time, hence the assert that
          // the call stub's return address has already been set
  662     assert(StubRoutines::_call_stub_return_address != nullptr,
  663            "_call_stub_return_address must have been generated before");
  664     __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));
  665 
  666     // record the stub entry and end
  667     store_archive_data(stub_id, start, __ pc());
  668 
  669     return start;
  670   }
  671 
  672   // Continuation point for runtime calls returning with a pending
  673   // exception.  The pending exception check happened in the runtime
  674   // or native call stub.  The pending exception in Thread is
  675   // converted into a Java-level exception.
  676   //
  677   // Contract with Java-level exception handlers:
  678   // r0: exception
  679   // r3: throwing pc
  680   //
  681   // NOTE: At entry of this stub, exception-pc must be in LR !!
  682 
  683   // NOTE: this is always used as a jump target within generated code
  684   // so it just needs to be generated code with no x86 prolog
  685 
  686   address generate_forward_exception() {
          // Emit the forward-exception stub and return its entry address.
          // When load_archive_data() already yields an address for this stub
          // id, that address is returned and nothing is generated.
  687     StubId stub_id = StubId::stubgen_forward_exception_id;
  688     int entry_count = StubInfo::entry_count(stub_id);
  689     assert(entry_count == 1, "sanity check");
  690     address start = load_archive_data(stub_id);
  691     if (start != nullptr) {
  692       return start;
  693     }
  694     StubCodeMark mark(this, stub_id);
  695     start = __ pc();
  696 
  697     // Upon entry, LR points to the return address returning into
  698     // Java (interpreted or compiled) code; i.e., the return address
  699     // becomes the throwing pc.
  700     //
  701     // Arguments pushed before the runtime call are still on the stack
  702     // but the exception handler will reset the stack pointer ->
  703     // ignore them.  A potential result in registers can be ignored as
  704     // well.
  705 
  706 #ifdef ASSERT
  707     // make sure this code is only executed if there is a pending exception
  708     {
  709       Label L;
  710       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  711       __ cbnz(rscratch1, L);
  712       __ stop("StubRoutines::forward exception: no pending exception (1)");
  713       __ bind(L);
  714     }
  715 #endif
  716 
  717     // compute exception handler into r19
  718 
  719     // call the VM to find the handler address associated with the
  720     // caller address. pass thread in r0 and caller pc (ret address)
  721     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  722     // the stack.
  723     __ mov(c_rarg1, lr);
  724     // lr will be trashed by the VM call so we move it to R19
  725     // (callee-saved) because we also need to pass it to the handler
  726     // returned by this call.
  727     __ mov(r19, lr);
  728     BLOCK_COMMENT("call exception_handler_for_return_address");
  729     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  730                          SharedRuntime::exception_handler_for_return_address),
  731                     rthread, c_rarg1);
  732     // Reinitialize the ptrue predicate register, in case the external runtime
  733     // call clobbers ptrue reg, as we may return to SVE compiled code.
  734     __ reinitialize_ptrue();
  735 
  736     // we should not really care that lr is no longer the callee
  737     // address. we saved the value the handler needs in r19 so we can
  738     // just copy it to r3. however, the C2 handler will push its own
  739     // frame and then calls into the VM and the VM code asserts that
  740     // the PC for the frame above the handler belongs to a compiled
  741     // Java method. So, we restore lr here to satisfy that assert.
  742     __ mov(lr, r19);
  743     // setup r0 & r3 & clear pending exception
          // Order matters: r19 still holds the saved throwing pc when it is
          // copied to r3; only afterwards is r19 overwritten with the value
          // left in r0 by the call (used below as the branch target), which
          // in turn frees r0 to receive the exception oop.
  744     __ mov(r3, r19);
  745     __ mov(r19, r0);
  746     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  747     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  748 
  749 #ifdef ASSERT
  750     // make sure exception is set
  751     {
  752       Label L;
  753       __ cbnz(r0, L);
  754       __ stop("StubRoutines::forward exception: no pending exception (2)");
  755       __ bind(L);
  756     }
  757 #endif
  758 
  759     // continue at exception handler
  760     // r0: exception
  761     // r3: throwing pc
  762     // r19: exception handler
  763     __ verify_oop(r0);
  764     __ br(r19);
  765 
  766     // record the stub entry and end
  767     store_archive_data(stub_id, start, __ pc());
  768 
  769     return start;
  770   }
  771 
  772   // Non-destructive plausibility checks for oops
  773   //
  774   // Arguments:
  775   //    r0: oop to verify
  776   //    rscratch1: error message
  777   //
  778   // Stack after saving c_rarg3:
  779   //    [tos + 0]: saved c_rarg3
  780   //    [tos + 1]: saved c_rarg2
  781   //    [tos + 2]: saved lr
  782   //    [tos + 3]: saved rscratch2
  783   //    [tos + 4]: saved r0
  784   //    [tos + 5]: saved rscratch1
  785   address generate_verify_oop() {
          // Emit the verify_oop stub and return its entry address (or the
          // address supplied by load_archive_data() if there is one).
          //
          // The generated code increments the counter found at
          // StubRoutines::verify_oop_count_addr(), accepts a null r0, and
          // otherwise hands r0 to the barrier set assembler's check_oop().
          // On the error path it calls MacroAssembler::debug64.
  786     StubId stub_id = StubId::stubgen_verify_oop_id;
  787     int entry_count = StubInfo::entry_count(stub_id);
  788     assert(entry_count == 1, "sanity check");
  789     address start = load_archive_data(stub_id);
  790     if (start != nullptr) {
  791       return start;
  792     }
  793     StubCodeMark mark(this, stub_id);
  794     start = __ pc();
  795 
  796     Label exit, error;
  797 
  798     // save c_rarg2 and c_rarg3
          // (both are used as temporaries below and restored on every exit)
  799     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  800 
  801     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
          // The load/add/store sequence below is the expansion of the
          // increment sketched in the comment above.
  802     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  803     __ ldr(c_rarg3, Address(c_rarg2));
  804     __ add(c_rarg3, c_rarg3, 1);
  805     __ str(c_rarg3, Address(c_rarg2));
  806 
  807     // object is in r0
  808     // make sure object is 'reasonable'
  809     __ cbz(r0, exit); // if obj is null it is OK
  810 
          // c_rarg2 and c_rarg3 are passed as scratch registers; control
          // reaches 'error' only via the label handed to check_oop().
  811     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  812     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  813 
  814     // return if everything seems ok
  815     __ bind(exit);
  816 
  817     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  818     __ ret(lr);
  819 
  820     // handle errors
  821     __ bind(error);
          // Restore the two temporaries first so the register dump pushed
          // next contains the values they had on entry.
  822     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  823 
          // Push r0..r29; sp then addresses the regs[] array passed below.
  824     __ push(RegSet::range(r0, r29), sp);
  825     // debug(char* msg, int64_t pc, int64_t regs[])
  826     __ mov(c_rarg0, rscratch1);      // pass address of error message
  827     __ mov(c_rarg1, lr);             // pass return address
  828     __ mov(c_rarg2, sp);             // pass address of regs on stack
  829 #ifndef PRODUCT
  830     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  831 #endif
  832     BLOCK_COMMENT("call MacroAssembler::debug");
          // rscratch1 is reused as the call target now that the message
          // pointer it held has been copied to c_rarg0.
  833     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  834     __ blr(rscratch1);
          // NOTE(review): presumably debug64 is not expected to return and the
          // hlt traps if it does -- confirm.
  835     __ hlt(0);
  836 
  837     // record the stub entry and end
  838     store_archive_data(stub_id, start, __ pc());
  839 
  840     return start;
  841   }
  842 
  843   // Generate indices for iota vector.
  844   void generate_iota_indices(StubId stub_id) {
          // Emit six consecutive 16-byte constant tables (B, H, S, D, S-FP,
          // D-FP) and publish their addresses in
          // StubRoutines::aarch64::_vector_iota_indices[0..VECTOR_IOTA_COUNT-1].
          //
          // Table 0 starts at 'start'; 'entries' collects the start addresses
          // of tables 1..5, hence the entry_count - 1 length checks. If
          // load_archive_data() supplies the addresses, they are installed
          // directly and nothing is emitted.
  845     GrowableArray<address> entries;
  846     int entry_count = StubInfo::entry_count(stub_id);
  847     assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
  848     address start = load_archive_data(stub_id, &entries);
  849     if (start != nullptr) {
  850       assert(entries.length() == entry_count - 1,
  851              "unexpected entries count %d", entries.length());
  852       StubRoutines::aarch64::_vector_iota_indices[0] = start;
  853       for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
  854         StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
  855       }
  856       return;
  857     }
  858     __ align(CodeEntryAlignment);
  859     StubCodeMark mark(this, stub_id);
  860     start = __ pc();
          // Each table is two 64-bit words. Every append() below records the
          // pc right after the previous table, i.e. the start of the next one.
  861     // B
  862     __ emit_data64(0x0706050403020100, relocInfo::none);
  863     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  864     entries.append(__ pc());
  865     // H
  866     __ emit_data64(0x0003000200010000, relocInfo::none);
  867     __ emit_data64(0x0007000600050004, relocInfo::none);
  868     entries.append(__ pc());
  869     // S
  870     __ emit_data64(0x0000000100000000, relocInfo::none);
  871     __ emit_data64(0x0000000300000002, relocInfo::none);
  872     entries.append(__ pc());
  873     // D
  874     __ emit_data64(0x0000000000000000, relocInfo::none);
  875     __ emit_data64(0x0000000000000001, relocInfo::none);
  876     entries.append(__ pc());
  877     // S - FP
  878     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  879     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  880     entries.append(__ pc());
  881     // D - FP
  882     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  883     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  884 
  885     // record the stub entry and end
  886     store_archive_data(stub_id, start, __ pc(), &entries);
  887 
  888     // install the entry addresses in the entry array
  889     assert(entries.length() == entry_count - 1,
  890            "unexpected entries count %d", entries.length());
  891     StubRoutines::aarch64::_vector_iota_indices[0] = start;
  892     for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
  893       StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
  894     }
  895   }
  896 
  897   // The inner part of zero_words().  This is the bulk operation,
  898   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  899   // caller is responsible for zeroing the last few words.
  900   //
  901   // Inputs:
  902   // r10: the HeapWord-aligned base address of an array to zero.
  903   // r11: the count in HeapWords, r11 > 0.
  904   //
  905   // Returns r10 and r11, adjusted for the caller to clear.
  906   // r10: the base address of the tail of words left to clear.
  907   // r11: the number of words in the tail.
  908   //      r11 < MacroAssembler::zero_words_block_size.
  909 
  910   address generate_zero_blocks() {
          // Emit the zero_blocks stub and return its entry address (or the
          // address supplied by load_archive_data() if there is one).
          //
          // Generated code: optionally (UseBlockZeroing) aligns base and
          // zeroes via zero_dcache_blocks() when the count is large enough,
          // then clears the rest in unrolled groups of
          // MacroAssembler::zero_words_block_size words, leaving the
          // remaining tail count in r11.
  911     StubId stub_id = StubId::stubgen_zero_blocks_id;
  912     int entry_count = StubInfo::entry_count(stub_id);
  913     assert(entry_count == 1, "sanity check");
  914     address start = load_archive_data(stub_id);
  915     if (start != nullptr) {
  916       return start;
  917     }
  918     __ align(CodeEntryAlignment);
  919     StubCodeMark mark(this, stub_id);
  920     Label done;
  921     Label base_aligned;
  922 
  923     Register base = r10, cnt = r11;
  924 
  925     start = __ pc();
  926 
  927     if (UseBlockZeroing) {
  928       int zva_length = VM_Version::zva_length();
  929 
  930       // Ensure ZVA length can be divided by 16. This is required by
  931       // the subsequent operations.
  932       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  933 
            // If bit 3 of base is set, zero one word and advance base by 8 so
            // that bit 3 becomes clear; the count drops by that one word.
  934       __ tbz(base, 3, base_aligned);
  935       __ str(zr, Address(__ post(base, 8)));
  936       __ sub(cnt, cnt, 1);
  937       __ bind(base_aligned);
  938 
  939       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  940       // alignment.
  941       Label small;
  942       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
            // NOTE(review): low_limit is presumably in bytes, with >> 3
            // converting it to words to match cnt -- confirm. Only the flags
            // of the subtraction are used; rscratch1 is a throwaway result.
  943       __ subs(rscratch1, cnt, low_limit >> 3);
  944       __ br(Assembler::LT, small);
  945       __ zero_dcache_blocks(base, cnt);
  946       __ bind(small);
  947     }
  948 
  949     {
  950       // Number of stp instructions we'll unroll
  951       const int unroll =
  952         MacroAssembler::zero_words_block_size / 2;
  953       // Clear the remaining blocks.
  954       Label loop;
            // cnt is pre-decremented by one block (unroll * 2 words); a
            // negative result means less than a full block is left.
  955       __ subs(cnt, cnt, unroll * 2);
  956       __ br(Assembler::LT, done);
  957       __ bind(loop);
            // Each stp zeroes two words and post-increments base by 16 bytes.
  958       for (int i = 0; i < unroll; i++)
  959         __ stp(zr, zr, __ post(base, 16));
  960       __ subs(cnt, cnt, unroll * 2);
  961       __ br(Assembler::GE, loop);
  962       __ bind(done);
            // Undo the final over-subtraction so cnt holds the words that
            // are still to be cleared by the caller.
  963       __ add(cnt, cnt, unroll * 2);
  964     }
  965 
  966     __ ret(lr);
  967 
  968     // record the stub entry and end
  969     store_archive_data(stub_id, start, __ pc());
  970 
  971     return start;
  972   }
  973 
  974 
  975   typedef enum {
          // Direction of an array copy. The numeric values matter: they are
          // multiplied with wordSize to form a signed per-word step (see
          // generate_copy_longs).
  976     copy_forwards = 1,
  977     copy_backwards = -1
  978   } copy_direction;
  979 
  980   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  981   // for arraycopy stubs.
  982   class ArrayCopyBarrierSetHelper : StackObj {
          // Captures the assembler, decorators, element type and the scratch
          // registers once, so each copy_load_at_N / copy_store_at_N call
          // below only has to name the data registers and the address.
          // N is the number of bytes transferred (32, 16 or 8).
  983     BarrierSetAssembler* _bs_asm;  // obtained from the current BarrierSet
  984     MacroAssembler* _masm;
  985     DecoratorSet _decorators;
  986     BasicType _type;
          // General-purpose temporaries handed to the barrier set assembler.
  987     Register _gct1;
  988     Register _gct2;
  989     Register _gct3;
          // Vector temporaries handed to the barrier set assembler.
  990     FloatRegister _gcvt1;
  991     FloatRegister _gcvt2;
  992     FloatRegister _gcvt3;
  993 
  994   public:
          // Stores the arguments verbatim and looks up the barrier set
          // assembler of the active BarrierSet.
  995     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  996                               DecoratorSet decorators,
  997                               BasicType type,
  998                               Register gct1,
  999                               Register gct2,
 1000                               Register gct3,
 1001                               FloatRegister gcvt1,
 1002                               FloatRegister gcvt2,
 1003                               FloatRegister gcvt3)
 1004       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 1005         _masm(masm),
 1006         _decorators(decorators),
 1007         _type(type),
 1008         _gct1(gct1),
 1009         _gct2(gct2),
 1010         _gct3(gct3),
 1011         _gcvt1(gcvt1),
 1012         _gcvt2(gcvt2),
 1013         _gcvt3(gcvt3) {
 1014     }
 1015 
          // 32-byte load into a pair of vector registers. Only _gct1, _gct2
          // and _gcvt1 are passed as temporaries.
 1016     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 1017       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 1018                             dst1, dst2, src,
 1019                             _gct1, _gct2, _gcvt1);
 1020     }
 1021 
          // 32-byte store from a pair of vector registers; all six
          // temporaries are passed.
 1022     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 1023       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 1024                              dst, src1, src2,
 1025                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 1026     }
 1027 
          // 16-byte load into a pair of general-purpose registers; only
          // _gct1 is passed as a temporary.
 1028     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 1029       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 1030                             dst1, dst2, src,
 1031                             _gct1);
 1032     }
 1033 
          // 16-byte store from a pair of general-purpose registers.
 1034     void copy_store_at_16(Address dst, Register src1, Register src2) {
 1035       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 1036                              dst, src1, src2,
 1037                              _gct1, _gct2, _gct3);
 1038     }
 1039 
          // 8-byte load into a single register; noreg fills the unused
          // second destination slot.
 1040     void copy_load_at_8(Register dst, Address src) {
 1041       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 1042                             dst, noreg, src,
 1043                             _gct1);
 1044     }
 1045 
          // 8-byte store from a single register; noreg fills the unused
          // second source slot.
 1046     void copy_store_at_8(Address dst, Register src) {
 1047       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 1048                              dst, src, noreg,
 1049                              _gct1, _gct2, _gct3);
 1050     }
 1051   };
 1052 
 1053   // Bulk copy of blocks of 8 words.
 1054   //
 1055   // count is a count of words.
 1056   //
 1057   // Precondition: count >= 8
 1058   //
 1059   // Postconditions:
 1060   //
 1061   // The least significant bit of count contains the remaining count
 1062   // of words to copy.  The rest of count is trash.
 1063   //
 1064   // s and d are adjusted to point to the remaining words to copy
 1065   //
 1066   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
          // Emit one of the bulk word-copy stubs and return its entry address
          // (or the address supplied by load_archive_data() if there is one).
          //
          // stub_id selects the copy direction and the BasicType passed to
          // the GC barrier helper; decorators are forwarded to that helper
          // unchanged. s, d and count name the registers the generated code
          // uses for source, destination and word count.
          //
          // The generated code is software-pipelined: 8 words are loaded up
          // front, the main loop stores the previous 8 words while loading
          // the next 8, and the drain stores the last loaded block before
          // handling a 4-word and a 2-word remainder.
 1067     int entry_count = StubInfo::entry_count(stub_id);
 1068     assert(entry_count == 1, "sanity check");
 1069     address start = load_archive_data(stub_id);
 1070     if (start != nullptr) {
 1071       return start;
 1072     }
 1073     BasicType type;
 1074     copy_direction direction;
 1075 
          // Both locals are assigned in every accepted case; any other
          // stub_id ends in ShouldNotReachHere().
 1076     switch (stub_id) {
 1077     case StubId::stubgen_copy_byte_f_id:
 1078       direction = copy_forwards;
 1079       type = T_BYTE;
 1080       break;
 1081     case StubId::stubgen_copy_byte_b_id:
 1082       direction = copy_backwards;
 1083       type = T_BYTE;
 1084       break;
 1085     case StubId::stubgen_copy_oop_f_id:
 1086       direction = copy_forwards;
 1087       type = T_OBJECT;
 1088       break;
 1089     case StubId::stubgen_copy_oop_b_id:
 1090       direction = copy_backwards;
 1091       type = T_OBJECT;
 1092       break;
 1093     case StubId::stubgen_copy_oop_uninit_f_id:
 1094       direction = copy_forwards;
 1095       type = T_OBJECT;
 1096       break;
 1097     case StubId::stubgen_copy_oop_uninit_b_id:
 1098       direction = copy_backwards;
 1099       type = T_OBJECT;
 1100       break;
 1101     default:
 1102       ShouldNotReachHere();
 1103     }
 1104 
          // unit: signed size of one word, so 'k * unit' is a byte offset
          // that points k words along the direction of the copy.
          // bias: amount s and d are moved back for a forwards copy (4 words
          // with SIMD, 2 words otherwise) so that the same positive
          // multiples of unit can be used as offsets in both directions.
 1105     int unit = wordSize * direction;
 1106     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 1107 
 1108     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 1109       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 1110     const Register stride = r14;
 1111     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1112     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1113     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1114 
 1115     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 1116     assert_different_registers(s, d, count, rscratch1, rscratch2);
 1117 
 1118     Label again, drain;
 1119 
 1120     __ align(CodeEntryAlignment);
 1121 
 1122     StubCodeMark mark(this, stub_id);
 1123 
 1124     start = __ pc();
 1125 
          // With AvoidUnalignedAccesses, a destination with bit 3 set is
          // handled by the separate code path emitted after the first ret.
 1126     Label unaligned_copy_long;
 1127     if (AvoidUnalignedAccesses) {
 1128       __ tbnz(d, 3, unaligned_copy_long);
 1129     }
 1130 
 1131     if (direction == copy_forwards) {
 1132       __ sub(s, s, bias);
 1133       __ sub(d, d, bias);
 1134     }
 1135 
 1136 #ifdef ASSERT
 1137     // Make sure we are never given < 8 words
 1138     {
 1139       Label L;
 1140       __ cmp(count, (u1)8);
 1141       __ br(Assembler::GE, L);
 1142       __ stop("genrate_copy_longs called with < 8 words");
 1143       __ bind(L);
 1144     }
 1145 #endif
 1146 
 1147     // Fill 8 registers
          // (the last load of each group pre-indexes s by 8 words)
 1148     if (UseSIMDForMemoryOps) {
 1149       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 1150       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 1151     } else {
 1152       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1153       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1154       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1155       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1156     }
 1157 
          // 8 words are in flight. Fewer than 16 words in total means there
          // is no second full block to load, so go straight to the drain.
 1158     __ subs(count, count, 16);
 1159     __ br(Assembler::LO, drain);
 1160 
          // For a backwards copy the prefetch distance is negated; when it
          // exceeds 256 it is materialized in the stride register and used
          // as a register offset instead of an immediate.
 1161     int prefetch = PrefetchCopyIntervalInBytes;
 1162     bool use_stride = false;
 1163     if (direction == copy_backwards) {
 1164       use_stride = prefetch > 256;
 1165       prefetch = -prefetch;
 1166       if (use_stride) __ mov(stride, prefetch);
 1167     }
 1168 
 1169     __ bind(again);
 1170 
 1171     if (PrefetchCopyIntervalInBytes > 0)
 1172       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1173 
          // Main loop body: each store writes registers filled on the
          // previous iteration, immediately followed by the load that
          // refills them for the next one.
 1174     if (UseSIMDForMemoryOps) {
 1175       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 1176       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 1177       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 1178       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 1179     } else {
 1180       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1181       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1182       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 1183       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1184       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 1185       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1186       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 1187       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1188     }
 1189 
 1190     __ subs(count, count, 8);
 1191     __ br(Assembler::HS, again);
 1192 
 1193     // Drain
          // Store the 8 words that are still held in registers.
 1194     __ bind(drain);
 1195     if (UseSIMDForMemoryOps) {
 1196       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 1197       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 1198     } else {
 1199       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1200       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 1201       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 1202       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 1203     }
 1204 
 1205     {
 1206       Label L1, L2;
            // count was only ever reduced by multiples of 8, so its low three
            // bits are those of the original count: bit 2 flags a remaining
            // 4-word sub-block, bit 1 a remaining 2-word sub-block.
 1207       __ tbz(count, exact_log2(4), L1);
 1208       if (UseSIMDForMemoryOps) {
 1209         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 1210         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 1211       } else {
 1212         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1213         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1214         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1215         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 1216       }
 1217       __ bind(L1);
 1218 
            // Undo the bias applied on entry before the final 2-word copy.
 1219       if (direction == copy_forwards) {
 1220         __ add(s, s, bias);
 1221         __ add(d, d, bias);
 1222       }
 1223 
 1224       __ tbz(count, 1, L2);
            // NOTE(review): adjust() presumably selects pre-indexed addressing
            // for the backwards copy and post-indexed for the forwards copy --
            // confirm against MacroAssembler::adjust.
 1225       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 1226       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 1227       __ bind(L2);
 1228     }
 1229 
 1230     __ ret(lr);
 1231 
 1232     if (AvoidUnalignedAccesses) {
            // These labels deliberately shadow the outer 'drain' and 'again';
            // the code below is a self-contained second copy loop.
 1233       Label drain, again;
 1234       // Register order for storing. Order is different for backward copy.
 1235 
 1236       __ bind(unaligned_copy_long);
 1237 
 1238       // source address is even aligned, target odd aligned
 1239       //
 1240       // when forward copying word pairs we read long pairs at offsets
 1241       // {0, 2, 4, 6} (in long words). when backwards copying we read
 1242       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 1243       // address by -2 in the forwards case so we can compute the
 1244       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 1245       // or -1.
 1246       //
 1247       // when forward copying we need to store 1 word, 3 pairs and
 1248       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 1249       // zero offset we adjust the destination by -1 which means we
 1250       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 1251       //
 1252       // When backwards copying we need to store 1 word, 3 pairs and
 1253       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1254       // offsets {1, 3, 5, 7, 8} * unit.
 1255 
 1256       if (direction == copy_forwards) {
 1257         __ sub(s, s, 16);
 1258         __ sub(d, d, 8);
 1259       }
 1260 
 1261       // Fill 8 registers
 1262       //
 1263       // for forwards copy s was offset by -16 from the original input
 1264       // value of s so the register contents are at these offsets
 1265       // relative to the 64 byte block addressed by that original input
 1266       // and so on for each successive 64 byte block when s is updated
 1267       //
 1268       // t0 at offset 0,  t1 at offset 8
 1269       // t2 at offset 16, t3 at offset 24
 1270       // t4 at offset 32, t5 at offset 40
 1271       // t6 at offset 48, t7 at offset 56
 1272 
 1273       // for backwards copy s was not offset so the register contents
 1274       // are at these offsets into the preceding 64 byte block
 1275       // relative to that original input and so on for each successive
 1276       // preceding 64 byte block when s is updated. this explains the
 1277       // slightly counter-intuitive looking pattern of register usage
 1278       // in the stp instructions for backwards copy.
 1279       //
 1280       // t0 at offset -16, t1 at offset -8
 1281       // t2 at offset -32, t3 at offset -24
 1282       // t4 at offset -48, t5 at offset -40
 1283       // t6 at offset -64, t7 at offset -56
 1284 
 1285       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1286       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1287       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1288       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1289 
            // Same pipelining and prefetch set-up as in the aligned path.
 1290       __ subs(count, count, 16);
 1291       __ br(Assembler::LO, drain);
 1292 
 1293       int prefetch = PrefetchCopyIntervalInBytes;
 1294       bool use_stride = false;
 1295       if (direction == copy_backwards) {
 1296         use_stride = prefetch > 256;
 1297         prefetch = -prefetch;
 1298         if (use_stride) __ mov(stride, prefetch);
 1299       }
 1300 
 1301       __ bind(again);
 1302 
 1303       if (PrefetchCopyIntervalInBytes > 0)
 1304         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1305 
 1306       if (direction == copy_forwards) {
 1307         // allowing for the offset of -8 the store instructions place
 1308         // registers into the target 64 byte block at the following
 1309         // offsets
 1310         //
 1311         // t0 at offset 0
 1312         // t1 at offset 8,  t2 at offset 16
 1313         // t3 at offset 24, t4 at offset 32
 1314         // t5 at offset 40, t6 at offset 48
 1315         // t7 at offset 56
 1316 
 1317         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1318         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1319         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1320         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1321         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1322         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1323         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1324         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1325         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1326       } else {
 1327         // d was not offset when we started so the registers are
 1328         // written into the 64 byte block preceding d with the following
 1329         // offsets
 1330         //
 1331         // t1 at offset -8
 1332         // t3 at offset -24, t0 at offset -16
 1333         // t5 at offset -40, t2 at offset -32
 1334         // t7 at offset -56, t4 at offset -48
 1335         //                   t6 at offset -64
 1336         //
 1337         // note that this matches the offsets previously noted for the
 1338         // loads
 1339 
 1340         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1341         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1342         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1343         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1344         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1345         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1346         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1347         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1348         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1349       }
 1350 
 1351       __ subs(count, count, 8);
 1352       __ br(Assembler::HS, again);
 1353 
 1354       // Drain
 1355       //
 1356       // this uses the same pattern of offsets and register arguments
 1357       // as above
 1358       __ bind(drain);
 1359       if (direction == copy_forwards) {
 1360         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1361         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1362         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1363         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1364         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1365       } else {
 1366         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1367         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1368         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1369         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1370         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1371       }
 1372       // now we need to copy any remaining part block which may
 1373       // include a 4 word subblock and/or a 2 word subblock.
 1374       // bits 2 and 1 in the count are the tell-tale for whether we
 1375       // have each such subblock
 1376       {
 1377         Label L1, L2;
 1378         __ tbz(count, exact_log2(4), L1);
 1379         // this is the same as above but copying only 4 longs hence
 1380         // with only one intervening stp between the str instructions
 1381         // but note that the offsets and registers still follow the
 1382         // same pattern
 1383         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1384         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1385         if (direction == copy_forwards) {
 1386           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1387           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1388           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1389         } else {
 1390           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1391           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1392           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1393         }
 1394         __ bind(L1);
 1395 
 1396         __ tbz(count, 1, L2);
 1397         // this is the same as above but copying only 2 longs hence
 1398         // there is no intervening stp between the str instructions
 1399         // but note that the offset and register patterns are still
 1400         // the same
 1401         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1402         if (direction == copy_forwards) {
 1403           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1404           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1405         } else {
 1406           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1407           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1408         }
 1409         __ bind(L2);
 1410 
 1411         // for forwards copy we need to re-adjust the offsets we
 1412         // applied so that s and d follow the last words written
 1413 
 1414         if (direction == copy_forwards) {
 1415           __ add(s, s, 16);
 1416           __ add(d, d, 8);
 1417         }
 1418 
 1419       }
 1420 
 1421       __ ret(lr);
 1422     }
 1423 
 1424     // record the stub entry and end
 1425     store_archive_data(stub_id, start, __ pc());
 1426 
 1427     return start;
 1428   }
 1429 
 1430   // Small copy: less than 16 bytes.
 1431   //
 1432   // NB: Ignores all of the bits of count which represent more than 15
 1433   // bytes, so a caller doesn't have to mask them.
 1434 
 1435   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
          // Emit inline code that copies the sub-16-byte remainder described
          // by the low bits of count.
          //
          // step is the signed element size in bytes: its magnitude is the
          // copy granularity (1..8) and a negative value selects a backwards
          // copy. count is in elements, so the bit that stands for an
          // N-byte chunk is log2(N) - log2(granularity); one chunk of each
          // size from 8 bytes down to the granularity is copied if its bit
          // is set.
 1436     bool is_backwards = step < 0;
 1437     size_t granularity = g_uabs(step);
 1438     int direction = is_backwards ? -1 : 1;
 1439 
 1440     Label Lword, Lint, Lshort, Lbyte;
 1441 
 1442     assert(granularity
 1443            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1444 
 1445     const Register t0 = r3;
 1446     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
          // No vector temporaries are needed here, hence fnoreg.
 1447     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1448 
 1449     // ??? I don't know if this bit-test-and-branch is the right thing
 1450     // to do.  It does a lot of jumping, resulting in several
 1451     // mispredicted branches.  It might make more sense to do this
 1452     // with something like Duff's device with a single computed branch.
 1453 
          // 8-byte chunk: the only one routed through the barrier helper.
          // NOTE(review): adjust() presumably yields pre-indexed addressing
          // when is_backwards and post-indexed otherwise -- confirm against
          // MacroAssembler::adjust.
 1454     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1455     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1456     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1457     __ bind(Lword);
 1458 
          // 4-byte chunk, emitted only for granularity <= 4; uses plain
          // ldrw/strw.
 1459     if (granularity <= sizeof (jint)) {
 1460       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1461       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1462       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1463       __ bind(Lint);
 1464     }
 1465 
          // 2-byte chunk, emitted only for granularity <= 2.
 1466     if (granularity <= sizeof (jshort)) {
 1467       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1468       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1469       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1470       __ bind(Lshort);
 1471     }
 1472 
          // Single byte, emitted only for byte granularity.
 1473     if (granularity <= sizeof (jbyte)) {
 1474       __ tbz(count, 0, Lbyte);
 1475       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1476       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1477       __ bind(Lbyte);
 1478     }
 1479   }
 1480 
 1481   // All-singing all-dancing memory copy.
 1482   //
 1483   // Copy count units of memory from s to d.  The size of a unit is
 1484   // step, which can be positive or negative depending on the direction
 1485   // of copy.  If is_aligned is false, we align the source address.
 1486   //
 1487 
 1488   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1489                    Register s, Register d, Register count, int step) {
 1490     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1491     bool is_backwards = step < 0;
 1492     unsigned int granularity = g_uabs(step);
 1493     const Register t0 = r3, t1 = r4;
 1494 
 1495     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
 1496     // load all the data before writing anything
 1497     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1498     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1499     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1500     const Register send = r17, dend = r16;
 1501     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1502     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1503     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1504 
 1505     if (PrefetchCopyIntervalInBytes > 0)
 1506       __ prfm(Address(s, 0), PLDL1KEEP);
 1507     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1508     __ br(Assembler::HI, copy_big);
 1509 
 1510     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1511     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1512 
 1513     __ cmp(count, u1(16/granularity));
 1514     __ br(Assembler::LS, copy16);
 1515 
 1516     __ cmp(count, u1(64/granularity));
 1517     __ br(Assembler::HI, copy80);
 1518 
 1519     __ cmp(count, u1(32/granularity));
 1520     __ br(Assembler::LS, copy32);
 1521 
 1522     // 33..64 bytes
 1523     if (UseSIMDForMemoryOps) {
 1524       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1525       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1526       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1527       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1528     } else {
 1529       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1530       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1531       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1532       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1533 
 1534       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1535       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1536       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1537       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1538     }
 1539     __ b(finish);
 1540 
 1541     // 17..32 bytes
 1542     __ bind(copy32);
 1543     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1544     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1545 
 1546     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1547     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1548     __ b(finish);
 1549 
 1550     // 65..80/96 bytes
 1551     // (96 bytes if SIMD because we do 32 byes per instruction)
 1552     __ bind(copy80);
 1553     if (UseSIMDForMemoryOps) {
 1554       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1555       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1556       // Unaligned pointers can be an issue for copying.
 1557       // The issue has more chances to happen when granularity of data is
 1558       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
 1559       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1560       // The most performance drop has been seen for the range 65-80 bytes.
 1561       // For such cases using the pair of ldp/stp instead of the third pair of
 1562       // ldpq/stpq fixes the performance issue.
 1563       if (granularity < sizeof (jint)) {
 1564         Label copy96;
 1565         __ cmp(count, u1(80/granularity));
 1566         __ br(Assembler::HI, copy96);
 1567         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1568 
 1569         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1570         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1571 
 1572         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1573         __ b(finish);
 1574 
 1575         __ bind(copy96);
 1576       }
 1577       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1578 
 1579       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1580       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1581 
 1582       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1583     } else {
 1584       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1585       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1586       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1587       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1588       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1589 
 1590       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1591       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1592       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1593       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1594       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1595     }
 1596     __ b(finish);
 1597 
 1598     // 0..16 bytes
 1599     __ bind(copy16);
 1600     __ cmp(count, u1(8/granularity));
 1601     __ br(Assembler::LO, copy8);
 1602 
 1603     // 8..16 bytes
 1604     bs.copy_load_at_8(t0, Address(s, 0));
 1605     bs.copy_load_at_8(t1, Address(send, -8));
 1606     bs.copy_store_at_8(Address(d, 0), t0);
 1607     bs.copy_store_at_8(Address(dend, -8), t1);
 1608     __ b(finish);
 1609 
 1610     if (granularity < 8) {
 1611       // 4..7 bytes
 1612       __ bind(copy8);
 1613       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1614       __ ldrw(t0, Address(s, 0));
 1615       __ ldrw(t1, Address(send, -4));
 1616       __ strw(t0, Address(d, 0));
 1617       __ strw(t1, Address(dend, -4));
 1618       __ b(finish);
 1619       if (granularity < 4) {
 1620         // 0..3 bytes
 1621         __ bind(copy4);
 1622         __ cbz(count, finish); // get rid of 0 case
 1623         if (granularity == 2) {
 1624           __ ldrh(t0, Address(s, 0));
 1625           __ strh(t0, Address(d, 0));
 1626         } else { // granularity == 1
 1627           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1628           // the first and last byte.
 1629           // Handle the 3 byte case by loading and storing base + count/2
 1630           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1631           // This does means in the 1 byte case we load/store the same
 1632           // byte 3 times.
 1633           __ lsr(count, count, 1);
 1634           __ ldrb(t0, Address(s, 0));
 1635           __ ldrb(t1, Address(send, -1));
 1636           __ ldrb(t2, Address(s, count));
 1637           __ strb(t0, Address(d, 0));
 1638           __ strb(t1, Address(dend, -1));
 1639           __ strb(t2, Address(d, count));
 1640         }
 1641         __ b(finish);
 1642       }
 1643     }
 1644 
 1645     __ bind(copy_big);
 1646     if (is_backwards) {
 1647       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1648       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1649     }
 1650 
 1651     // Now we've got the small case out of the way we can align the
 1652     // source address on a 2-word boundary.
 1653 
 1654     // Here we will materialize a count in r15, which is used by copy_memory_small
 1655     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
 1656     // Up until here, we have used t9, which aliases r15, but from here on, that register
 1657     // can not be used as a temp register, as it contains the count.
 1658 
 1659     Label aligned;
 1660 
 1661     if (is_aligned) {
 1662       // We may have to adjust by 1 word to get s 2-word-aligned.
 1663       __ tbz(s, exact_log2(wordSize), aligned);
 1664       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1665       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1666       __ sub(count, count, wordSize/granularity);
 1667     } else {
 1668       if (is_backwards) {
 1669         __ andr(r15, s, 2 * wordSize - 1);
 1670       } else {
 1671         __ neg(r15, s);
 1672         __ andr(r15, r15, 2 * wordSize - 1);
 1673       }
 1674       // r15 is the byte adjustment needed to align s.
 1675       __ cbz(r15, aligned);
 1676       int shift = exact_log2(granularity);
 1677       if (shift > 0) {
 1678         __ lsr(r15, r15, shift);
 1679       }
 1680       __ sub(count, count, r15);
 1681 
 1682 #if 0
 1683       // ?? This code is only correct for a disjoint copy.  It may or
 1684       // may not make sense to use it in that case.
 1685 
 1686       // Copy the first pair; s and d may not be aligned.
 1687       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1688       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1689 
 1690       // Align s and d, adjust count
 1691       if (is_backwards) {
 1692         __ sub(s, s, r15);
 1693         __ sub(d, d, r15);
 1694       } else {
 1695         __ add(s, s, r15);
 1696         __ add(d, d, r15);
 1697       }
 1698 #else
 1699       copy_memory_small(decorators, type, s, d, r15, step);
 1700 #endif
 1701     }
 1702 
 1703     __ bind(aligned);
 1704 
 1705     // s is now 2-word-aligned.
 1706 
 1707     // We have a count of units and some trailing bytes. Adjust the
 1708     // count and do a bulk copy of words. If the shift is zero
 1709     // perform a move instead to benefit from zero latency moves.
 1710     int shift = exact_log2(wordSize/granularity);
 1711     if (shift > 0) {
 1712       __ lsr(r15, count, shift);
 1713     } else {
 1714       __ mov(r15, count);
 1715     }
 1716     if (direction == copy_forwards) {
 1717       if (type != T_OBJECT) {
 1718         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
 1719         __ blr(rscratch1);
 1720       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1721         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
 1722         __ blr(rscratch1);
 1723       } else {
 1724         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
 1725         __ blr(rscratch1);
 1726       }
 1727     } else {
 1728       if (type != T_OBJECT) {
 1729         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
 1730         __ blr(rscratch1);
 1731       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1732         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
 1733         __ blr(rscratch1);
 1734       } else {
 1735         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
 1736         __ blr(rscratch1);
 1737       }
 1738     }
 1739 
 1740     // And the tail.
 1741     copy_memory_small(decorators, type, s, d, count, step);
 1742 
 1743     if (granularity >= 8) __ bind(copy8);
 1744     if (granularity >= 4) __ bind(copy4);
 1745     __ bind(finish);
 1746   }
 1747 
 1748 
 1749   void clobber_registers() {
 1750 #ifdef ASSERT
 1751     RegSet clobbered
 1752       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1753     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1754     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1755     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1756       __ mov(*it, rscratch1);
 1757     }
 1758 #endif
 1759 
 1760   }
 1761 
 1762   // Scan over array at a for count oops, verifying each one.
 1763   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1764   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1765     Label loop, end;
 1766     __ mov(rscratch1, a);
 1767     __ mov(rscratch2, zr);
 1768     __ bind(loop);
 1769     __ cmp(rscratch2, count);
 1770     __ br(Assembler::HS, end);
 1771     if (size == wordSize) {
 1772       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1773       __ verify_oop(temp);
 1774     } else {
 1775       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1776       __ decode_heap_oop(temp); // calls verify_oop
 1777     }
 1778     __ add(rscratch2, rscratch2, 1);
 1779     __ b(loop);
 1780     __ bind(end);
 1781   }
 1782 
 1783   // Arguments:
 1784   //   stub_id - is used to name the stub and identify all details of
 1785   //             how to perform the copy.
 1786   //
 1787   //   nopush_entry - is assigned to the stub's post push entry point
 1788   //                  unless it is null
 1789   //
 1790   // Inputs:
 1791   //   c_rarg0   - source array address
 1792   //   c_rarg1   - destination array address
 1793   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1794   //
 1795   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1796   // the hardware handle it.  The two dwords within qwords that span
 1797   // cache line boundaries will still be loaded and stored atomically.
 1798   //
 1799   // Side Effects: nopush_entry is set to the (post push) entry point
 1800   //               so it can be used by the corresponding conjoint
 1801   //               copy method
 1802   //
 1803   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1804     int size;
 1805     bool aligned;
 1806     bool is_oop;
 1807     bool dest_uninitialized;
 1808     switch (stub_id) {
 1809     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1810       size = sizeof(jbyte);
 1811       aligned = false;
 1812       is_oop = false;
 1813       dest_uninitialized = false;
 1814       break;
 1815     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1816       size = sizeof(jbyte);
 1817       aligned = true;
 1818       is_oop = false;
 1819       dest_uninitialized = false;
 1820       break;
 1821     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1822       size = sizeof(jshort);
 1823       aligned = false;
 1824       is_oop = false;
 1825       dest_uninitialized = false;
 1826       break;
 1827     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1828       size = sizeof(jshort);
 1829       aligned = true;
 1830       is_oop = false;
 1831       dest_uninitialized = false;
 1832       break;
 1833     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1834       size = sizeof(jint);
 1835       aligned = false;
 1836       is_oop = false;
 1837       dest_uninitialized = false;
 1838       break;
 1839     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1840       size = sizeof(jint);
 1841       aligned = true;
 1842       is_oop = false;
 1843       dest_uninitialized = false;
 1844       break;
 1845     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1846       // since this is always aligned we can (should!) use the same
 1847       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1848       ShouldNotReachHere();
 1849       break;
 1850     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1851       size = sizeof(jlong);
 1852       aligned = true;
 1853       is_oop = false;
 1854       dest_uninitialized = false;
 1855       break;
 1856     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1857       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1858       aligned = !UseCompressedOops;
 1859       is_oop = true;
 1860       dest_uninitialized = false;
 1861       break;
 1862     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1863       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1864       aligned = !UseCompressedOops;
 1865       is_oop = true;
 1866       dest_uninitialized = false;
 1867       break;
 1868     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1869       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1870       aligned = !UseCompressedOops;
 1871       is_oop = true;
 1872       dest_uninitialized = true;
 1873       break;
 1874     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1875       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1876       aligned = !UseCompressedOops;
 1877       is_oop = true;
 1878       dest_uninitialized = true;
 1879       break;
 1880     default:
 1881       ShouldNotReachHere();
 1882       break;
 1883     }
 1884     // all stubs provide a 2nd entry which omits the frame push for
 1885     // use when bailing out from a conjoint copy. However we may also
 1886     // need some extra addressses for memory access protection.
 1887     int entry_count = StubInfo::entry_count(stub_id);
 1888     assert(entry_count == 2, "sanity check");
 1889     assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");
 1890 
 1891     bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
 1892     int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
 1893     GrowableArray<address> entries;
 1894     GrowableArray<address> extras;
 1895     GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
 1896     address start = load_archive_data(stub_id, &entries, extras_ptr);
 1897     if (start != nullptr) {
 1898       assert(entries.length() == entry_count - 1,
 1899              "unexpected entries count %d", entries.length());
 1900       *nopush_entry = entries.at(0);
 1901       assert(extras.length() == extra_count,
 1902              "unexpected extra count %d", extras.length());
 1903       if (add_extras) {
 1904         // register one handler at offset 0
 1905         register_unsafe_access_handlers(extras, 0, 1);
 1906       }
 1907       return start;
 1908     }
 1909 
 1910     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1911     RegSet saved_reg = RegSet::of(s, d, count);
 1912 
 1913     __ align(CodeEntryAlignment);
 1914     StubCodeMark mark(this, stub_id);
 1915     start = __ pc();
 1916     __ enter();
 1917 
 1918     *nopush_entry = __ pc();
 1919     entries.append(*nopush_entry);
 1920 
 1921     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1922     BLOCK_COMMENT("Post-Push Entry:");
 1923 
 1924     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1925     if (dest_uninitialized) {
 1926       decorators |= IS_DEST_UNINITIALIZED;
 1927     }
 1928     if (aligned) {
 1929       decorators |= ARRAYCOPY_ALIGNED;
 1930     }
 1931 
 1932     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1933     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1934 
 1935     if (is_oop) {
 1936       // save regs before copy_memory
 1937       __ push(RegSet::of(d, count), sp);
 1938     }
 1939     {
 1940       // UnsafeMemoryAccess page error: continue after unsafe access
 1941       UnsafeMemoryAccessMark umam(this, add_extras, true);
 1942       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1943     }
 1944 
 1945     if (is_oop) {
 1946       __ pop(RegSet::of(d, count), sp);
 1947       if (VerifyOops)
 1948         verify_oop_array(size, d, count, r16);
 1949     }
 1950 
 1951     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1952 
 1953     __ leave();
 1954     __ mov(r0, zr); // return 0
 1955     __ ret(lr);
 1956 
 1957     address end = __ pc();
 1958 
 1959     if (add_extras) {
 1960       // retrieve the registered handler addresses
 1961       retrieve_unsafe_access_handlers(start, end, extras);
 1962       assert(extras.length() == extra_count
 1963              , "incorrect handlers count %d", extras.length());
 1964     }
 1965 
 1966     // record the stub entry and end plus the no_push entry and any
 1967     // extra handler addresses
 1968     store_archive_data(stub_id, start, end, &entries, extras_ptr);
 1969 
 1970     return start;
 1971   }
 1972 
 1973   // Arguments:
 1974   //   stub_id - is used to name the stub and identify all details of
 1975   //             how to perform the copy.
 1976   //
 1977   //   nooverlap_target - identifies the (post push) entry for the
 1978   //             corresponding disjoint copy routine which can be
 1979   //             jumped to if the ranges do not actually overlap
 1980   //
 1981   //   nopush_entry - is assigned to the stub's post push entry point
 1982   //                  unless it is null
 1983   //
 1984   //
 1985   // Inputs:
 1986   //   c_rarg0   - source array address
 1987   //   c_rarg1   - destination array address
 1988   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1989   //
 1990   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1991   // the hardware handle it.  The two dwords within qwords that span
 1992   // cache line boundaries will still be loaded and stored atomically.
 1993   //
 1994   // Side Effects:
 1995   //   nopush_entry is set to the no-overlap entry point so it can be
 1996   //   used by some other conjoint copy method
 1997   //
 1998   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1999     int size;
 2000     bool aligned;
 2001     bool is_oop;
 2002     bool dest_uninitialized;
 2003     switch (stub_id) {
 2004     case StubId::stubgen_jbyte_arraycopy_id:
 2005       size = sizeof(jbyte);
 2006       aligned = false;
 2007       is_oop = false;
 2008       dest_uninitialized = false;
 2009       break;
 2010     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 2011       size = sizeof(jbyte);
 2012       aligned = true;
 2013       is_oop = false;
 2014       dest_uninitialized = false;
 2015       break;
 2016     case StubId::stubgen_jshort_arraycopy_id:
 2017       size = sizeof(jshort);
 2018       aligned = false;
 2019       is_oop = false;
 2020       dest_uninitialized = false;
 2021       break;
 2022     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 2023       size = sizeof(jshort);
 2024       aligned = true;
 2025       is_oop = false;
 2026       dest_uninitialized = false;
 2027       break;
 2028     case StubId::stubgen_jint_arraycopy_id:
 2029       size = sizeof(jint);
 2030       aligned = false;
 2031       is_oop = false;
 2032       dest_uninitialized = false;
 2033       break;
 2034     case StubId::stubgen_arrayof_jint_arraycopy_id:
 2035       size = sizeof(jint);
 2036       aligned = true;
 2037       is_oop = false;
 2038       dest_uninitialized = false;
 2039       break;
 2040     case StubId::stubgen_jlong_arraycopy_id:
 2041       // since this is always aligned we can (should!) use the same
 2042       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 2043       ShouldNotReachHere();
 2044       break;
 2045     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 2046       size = sizeof(jlong);
 2047       aligned = true;
 2048       is_oop = false;
 2049       dest_uninitialized = false;
 2050       break;
 2051     case StubId::stubgen_oop_arraycopy_id:
 2052       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2053       aligned = !UseCompressedOops;
 2054       is_oop = true;
 2055       dest_uninitialized = false;
 2056       break;
 2057     case StubId::stubgen_arrayof_oop_arraycopy_id:
 2058       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2059       aligned = !UseCompressedOops;
 2060       is_oop = true;
 2061       dest_uninitialized = false;
 2062       break;
 2063     case StubId::stubgen_oop_arraycopy_uninit_id:
 2064       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2065       aligned = !UseCompressedOops;
 2066       is_oop = true;
 2067       dest_uninitialized = true;
 2068       break;
 2069     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 2070       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2071       aligned = !UseCompressedOops;
 2072       is_oop = true;
 2073       dest_uninitialized = true;
 2074       break;
 2075     default:
 2076       ShouldNotReachHere();
 2077     }
 2078     // only some conjoint stubs generate a 2nd entry
 2079     int entry_count = StubInfo::entry_count(stub_id);
 2080     int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
 2081     assert(entry_count == expected_entry_count,
 2082            "expected entry count %d does not match declared entry count %d for stub %s",
 2083            expected_entry_count, entry_count, StubInfo::name(stub_id));
 2084 
 2085     // We need to protect memory accesses in certain cases
 2086     bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
 2087     int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
 2088     GrowableArray<address> entries;
 2089     GrowableArray<address> extras;
 2090     GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
 2091     GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
 2092     address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
 2093     if (start != nullptr) {
 2094       assert(entries.length() == expected_entry_count - 1,
 2095              "unexpected entries count %d", entries.length());
 2096       assert(extras.length() == extra_count,
 2097              "unexpected extra count %d", extras.length());
 2098       if (nopush_entry != nullptr) {
 2099         *nopush_entry = entries.at(0);
 2100       }
 2101       if (add_extras) {
 2102         // register one handler at offset 0
 2103         register_unsafe_access_handlers(extras, 0, 1);
 2104       }
 2105       return start;
 2106     }
 2107 
 2108     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2109     RegSet saved_regs = RegSet::of(s, d, count);
 2110     StubCodeMark mark(this, stub_id);
 2111     start = __ pc();
 2112     __ enter();
 2113 
 2114     if (nopush_entry != nullptr) {
 2115       *nopush_entry = __ pc();
 2116       entries.append(*nopush_entry);
 2117       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 2118       BLOCK_COMMENT("Post-Push Entry:");
 2119     }
 2120 
 2121     // use fwd copy when (d-s) above_equal (count*size)
 2122     Label L_overlapping;
 2123     __ sub(rscratch1, d, s);
 2124     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 2125     __ br(Assembler::LO, L_overlapping);
 2126     __ b(RuntimeAddress(nooverlap_target));
 2127     __ bind(L_overlapping);
 2128 
 2129     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 2130     if (dest_uninitialized) {
 2131       decorators |= IS_DEST_UNINITIALIZED;
 2132     }
 2133     if (aligned) {
 2134       decorators |= ARRAYCOPY_ALIGNED;
 2135     }
 2136 
 2137     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 2138     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 2139 
 2140     if (is_oop) {
 2141       // save regs before copy_memory
 2142       __ push(RegSet::of(d, count), sp);
 2143     }
 2144     {
 2145       // UnsafeMemoryAccess page error: continue after unsafe access
 2146       UnsafeMemoryAccessMark umam(this, add_extras, true);
 2147       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 2148     }
 2149     if (is_oop) {
 2150       __ pop(RegSet::of(d, count), sp);
 2151       if (VerifyOops)
 2152         verify_oop_array(size, d, count, r16);
 2153     }
 2154     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 2155     __ leave();
 2156     __ mov(r0, zr); // return 0
 2157     __ ret(lr);
 2158 
 2159     assert(entries.length() == expected_entry_count - 1,
 2160            "unexpected entries count %d", entries.length());
 2161 
 2162     address end = __ pc();
 2163 
 2164     if (add_extras) {
 2165       // retrieve the registered handler addresses
 2166       retrieve_unsafe_access_handlers(start, end, extras);
 2167       assert(extras.length() == extra_count,
 2168              "incorrect handlers count %d", extras.length());
 2169     }
 2170 
 2171     // record the stub entry and end plus any no_push entry and/or
 2172     // extra handler addresses
 2173     store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
 2174 
 2175     return start;
 2176   }
 2177 
 2178   // Helper for generating a dynamic type check.
 2179   // Smashes rscratch1, rscratch2.
 2180   void generate_type_check(Register sub_klass,
 2181                            Register super_check_offset,
 2182                            Register super_klass,
 2183                            Register temp1,
 2184                            Register temp2,
 2185                            Register result,
 2186                            Label& L_success) {
 2187     assert_different_registers(sub_klass, super_check_offset, super_klass);
 2188 
 2189     BLOCK_COMMENT("type_check:");
 2190 
 2191     Label L_miss;
 2192 
 2193     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 2194                                      super_check_offset);
 2195     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 2196 
 2197     // Fall through on failure!
 2198     __ BIND(L_miss);
 2199   }
 2200 
 2201   //
 2202   //  Generate checkcasting array copy stub
 2203   //
 2204   //  Input:
 2205   //    c_rarg0   - source array address
 2206   //    c_rarg1   - destination array address
 2207   //    c_rarg2   - element count, treated as ssize_t, can be zero
 2208   //    c_rarg3   - size_t ckoff (super_check_offset)
 2209   //    c_rarg4   - oop ckval (super_klass)
 2210   //
 2211   //  Output:
 2212   //    r0 ==  0  -  success
 2213   //    r0 == -1^K - failure, where K is partial transfer count
 2214   //
 2215   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 2216     bool dest_uninitialized;
 2217     switch (stub_id) {
 2218     case StubId::stubgen_checkcast_arraycopy_id:
 2219       dest_uninitialized = false;
 2220       break;
 2221     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 2222       dest_uninitialized = true;
 2223       break;
 2224     default:
 2225       ShouldNotReachHere();
 2226     }
 2227 
 2228     // The normal stub provides a 2nd entry which omits the frame push
 2229     // for use when bailing out from a disjoint copy.
 2230     // Only some conjoint stubs generate a 2nd entry
 2231     int entry_count = StubInfo::entry_count(stub_id);
 2232     int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
 2233     GrowableArray<address> entries;
 2234     GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
 2235     assert(entry_count == expected_entry_count,
 2236            "expected entry count %d does not match declared entry count %d for stub %s",
 2237            expected_entry_count, entry_count, StubInfo::name(stub_id));
 2238     address start = load_archive_data(stub_id, entries_ptr);
 2239     if (start != nullptr) {
 2240       assert(entries.length() + 1 == expected_entry_count,
 2241              "expected entry count %d does not match return entry count %d for stub %s",
 2242              expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
 2243       if (nopush_entry != nullptr) {
 2244         *nopush_entry = entries.at(0);
 2245       }
 2246       return start;
 2247     }
 2248 
 2249     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 2250 
 2251     // Input registers (after setup_arg_regs)
 2252     const Register from        = c_rarg0;   // source array address
 2253     const Register to          = c_rarg1;   // destination array address
 2254     const Register count       = c_rarg2;   // elementscount
 2255     const Register ckoff       = c_rarg3;   // super_check_offset
 2256     const Register ckval       = c_rarg4;   // super_klass
 2257 
 2258     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 2259 
 2260     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 2261     const Register copied_oop  = r22;       // actual oop copied
 2262     const Register count_save  = r21;       // orig elementscount
 2263     const Register start_to    = r20;       // destination array start address
 2264     const Register r19_klass   = r19;       // oop._klass
 2265 
 2266     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 2267     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 2268 
 2269     //---------------------------------------------------------------
 2270     // Assembler stub will be used for this call to arraycopy
 2271     // if the two arrays are subtypes of Object[] but the
 2272     // destination array type is not equal to or a supertype
 2273     // of the source type.  Each element must be separately
 2274     // checked.
 2275 
 2276     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 2277                                copied_oop, r19_klass, count_save);
 2278 
 2279     __ align(CodeEntryAlignment);
 2280     StubCodeMark mark(this, stub_id);
 2281     start = __ pc();
 2282 
 2283     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2284 
 2285 #ifdef ASSERT
 2286     // caller guarantees that the arrays really are different
 2287     // otherwise, we would have to make conjoint checks
 2288     { Label L;
 2289       __ b(L);                  // conjoint check not yet implemented
 2290       __ stop("checkcast_copy within a single array");
 2291       __ bind(L);
 2292     }
 2293 #endif //ASSERT
 2294 
 2295     // Caller of this entry point must set up the argument registers.
 2296     if (nopush_entry != nullptr) {
 2297       *nopush_entry = __ pc();
 2298       entries.append(*nopush_entry);
 2299       BLOCK_COMMENT("Entry:");
 2300     }
 2301 
 2302      // Empty array:  Nothing to do.
 2303     __ cbz(count, L_done);
 2304     __ push(RegSet::of(r19, r20, r21, r22), sp);
 2305 
 2306 #ifdef ASSERT
 2307     BLOCK_COMMENT("assert consistent ckoff/ckval");
 2308     // The ckoff and ckval must be mutually consistent,
 2309     // even though caller generates both.
 2310     { Label L;
 2311       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2312       __ ldrw(start_to, Address(ckval, sco_offset));
 2313       __ cmpw(ckoff, start_to);
 2314       __ br(Assembler::EQ, L);
 2315       __ stop("super_check_offset inconsistent");
 2316       __ bind(L);
 2317     }
 2318 #endif //ASSERT
 2319 
 2320     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 2321     bool is_oop = true;
 2322     int element_size = UseCompressedOops ? 4 : 8;
 2323     if (dest_uninitialized) {
 2324       decorators |= IS_DEST_UNINITIALIZED;
 2325     }
 2326 
 2327     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 2328     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 2329 
 2330     // save the original count
 2331     __ mov(count_save, count);
 2332 
 2333     // Copy from low to high addresses
 2334     __ mov(start_to, to);              // Save destination array start address
 2335     __ b(L_load_element);
 2336 
 2337     // ======== begin loop ========
 2338     // (Loop is rotated; its entry is L_load_element.)
 2339     // Loop control:
 2340     //   for (; count != 0; count--) {
 2341     //     copied_oop = load_heap_oop(from++);
 2342     //     ... generate_type_check ...;
 2343     //     store_heap_oop(to++, copied_oop);
 2344     //   }
 2345     __ align(OptoLoopAlignment);
 2346 
 2347     __ BIND(L_store_element);
 2348     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 2349                       __ post(to, element_size), copied_oop, noreg,
 2350                       gct1, gct2, gct3);
 2351     __ sub(count, count, 1);
 2352     __ cbz(count, L_do_card_marks);
 2353 
 2354     // ======== loop entry is here ========
 2355     __ BIND(L_load_element);
 2356     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 2357                      copied_oop, noreg, __ post(from, element_size),
 2358                      gct1);
 2359     __ cbz(copied_oop, L_store_element);
 2360 
 2361     __ load_klass(r19_klass, copied_oop);// query the object klass
 2362 
 2363     BLOCK_COMMENT("type_check:");
 2364     generate_type_check(/*sub_klass*/r19_klass,
 2365                         /*super_check_offset*/ckoff,
 2366                         /*super_klass*/ckval,
 2367                         /*r_array_base*/gct1,
 2368                         /*temp2*/gct2,
 2369                         /*result*/r10, L_store_element);
 2370 
 2371     // Fall through on failure!
 2372 
 2373     // ======== end loop ========
 2374 
 2375     // It was a real error; we must depend on the caller to finish the job.
 2376     // Register count = remaining oops, count_orig = total oops.
 2377     // Emit GC store barriers for the oops we have copied and report
 2378     // their number to the caller.
 2379 
 2380     __ subs(count, count_save, count);     // K = partially copied oop count
 2381     __ eon(count, count, zr);              // report (-1^K) to caller
 2382     __ br(Assembler::EQ, L_done_pop);
 2383 
 2384     __ BIND(L_do_card_marks);
 2385     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
 2386 
 2387     __ bind(L_done_pop);
 2388     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2389     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2390 
 2391     __ bind(L_done);
 2392     __ mov(r0, count);
 2393     __ leave();
 2394     __ ret(lr);
 2395 
 2396     // record the stub entry and end plus any no_push entry
 2397     store_archive_data(stub_id, start, __ pc() , entries_ptr);
 2398     return start;
 2399   }
 2400 
 2401   // Perform range checks on the proposed arraycopy.
 2402   // Kills temp, but nothing else.
 2403   // Also, clean the sign bits of src_pos and dst_pos.
 2404   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2405                               Register src_pos, // source position (c_rarg1)
 2406                               Register dst,     // destination array oo (c_rarg2)
 2407                               Register dst_pos, // destination position (c_rarg3)
 2408                               Register length,
 2409                               Register temp,
 2410                               Label& L_failed) {
 2411     BLOCK_COMMENT("arraycopy_range_checks:");
 2412 
 2413     assert_different_registers(rscratch1, temp);
 2414 
 2415     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2416     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2417     __ addw(temp, length, src_pos);
 2418     __ cmpw(temp, rscratch1);
 2419     __ br(Assembler::HI, L_failed);
 2420 
 2421     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2422     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2423     __ addw(temp, length, dst_pos);
 2424     __ cmpw(temp, rscratch1);
 2425     __ br(Assembler::HI, L_failed);
 2426 
 2427     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2428     __ movw(src_pos, src_pos);
 2429     __ movw(dst_pos, dst_pos);
 2430 
 2431     BLOCK_COMMENT("arraycopy_range_checks done");
 2432   }
 2433 
 2434   // These stubs get called from some dumb test routine.
 2435   // I'll write them properly when they're called from
 2436   // something that's actually doing something.
 2437   static void fake_arraycopy_stub(address src, address dst, int count) {
 2438     assert(count == 0, "huh?");
 2439   }
 2440 
 2441 
 2442   //
 2443   //  Generate 'unsafe' array copy stub
 2444   //  Though just as safe as the other stubs, it takes an unscaled
 2445   //  size_t argument instead of an element count.
 2446   //
 2447   //  Input:
 2448   //    c_rarg0   - source array address
 2449   //    c_rarg1   - destination array address
 2450   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2451   //
 2452   // Examines the alignment of the operands and dispatches
 2453   // to a long, int, short, or byte copy loop.
 2454   //
 2455   address generate_unsafe_copy(address byte_copy_entry,
 2456                                address short_copy_entry,
 2457                                address int_copy_entry,
 2458                                address long_copy_entry) {
 2459     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2460     int entry_count = StubInfo::entry_count(stub_id);
 2461     assert(entry_count == 1, "sanity check");
 2462     address start = load_archive_data(stub_id);
 2463     if (start != nullptr) {
 2464       return start;
 2465     }
 2466     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2467     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2468 
 2469     __ align(CodeEntryAlignment);
 2470     StubCodeMark mark(this, stub_id);
 2471     start = __ pc();
 2472     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2473 
 2474     // bump this on entry, not on exit:
 2475     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2476 
 2477     __ orr(rscratch1, s, d);
 2478     __ orr(rscratch1, rscratch1, count);
 2479 
 2480     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2481     __ cbz(rscratch1, L_long_aligned);
 2482     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2483     __ cbz(rscratch1, L_int_aligned);
 2484     __ tbz(rscratch1, 0, L_short_aligned);
 2485     __ b(RuntimeAddress(byte_copy_entry));
 2486 
 2487     __ BIND(L_short_aligned);
 2488     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2489     __ b(RuntimeAddress(short_copy_entry));
 2490     __ BIND(L_int_aligned);
 2491     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2492     __ b(RuntimeAddress(int_copy_entry));
 2493     __ BIND(L_long_aligned);
 2494     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2495     __ b(RuntimeAddress(long_copy_entry));
 2496 
 2497     // record the stub entry and end
 2498     store_archive_data(stub_id, start, __ pc());
 2499 
 2500     return start;
 2501   }
 2502 
 2503   //
 2504   //  Generate generic array copy stubs
 2505   //
 2506   //  Input:
 2507   //    c_rarg0    -  src oop
 2508   //    c_rarg1    -  src_pos (32-bits)
 2509   //    c_rarg2    -  dst oop
 2510   //    c_rarg3    -  dst_pos (32-bits)
 2511   //    c_rarg4    -  element count (32-bits)
 2512   //
 2513   //  Output:
 2514   //    r0 ==  0  -  success
 2515   //    r0 == -1^K - failure, where K is partial transfer count
 2516   //
 2517   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2518                                 address int_copy_entry, address oop_copy_entry,
 2519                                 address long_copy_entry, address checkcast_copy_entry) {
 2520     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2521     int entry_count = StubInfo::entry_count(stub_id);
 2522     assert(entry_count == 1, "sanity check");
 2523     address start = load_archive_data(stub_id);
 2524     if (start != nullptr) {
 2525       return start;
 2526     }
 2527     Label L_failed, L_objArray;
 2528     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2529 
 2530     // Input registers
 2531     const Register src        = c_rarg0;  // source array oop
 2532     const Register src_pos    = c_rarg1;  // source position
 2533     const Register dst        = c_rarg2;  // destination array oop
 2534     const Register dst_pos    = c_rarg3;  // destination position
 2535     const Register length     = c_rarg4;
 2536 
 2537 
 2538     // Registers used as temps
 2539     const Register dst_klass  = c_rarg5;
 2540 
 2541     __ align(CodeEntryAlignment);
 2542 
 2543     StubCodeMark mark(this, stub_id);
 2544 
 2545     start = __ pc();
 2546 
 2547     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2548 
 2549     // bump this on entry, not on exit:
 2550     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2551 
 2552     //-----------------------------------------------------------------------
 2553     // Assembler stub will be used for this call to arraycopy
 2554     // if the following conditions are met:
 2555     //
 2556     // (1) src and dst must not be null.
 2557     // (2) src_pos must not be negative.
 2558     // (3) dst_pos must not be negative.
 2559     // (4) length  must not be negative.
 2560     // (5) src klass and dst klass should be the same and not null.
 2561     // (6) src and dst should be arrays.
 2562     // (7) src_pos + length must not exceed length of src.
 2563     // (8) dst_pos + length must not exceed length of dst.
 2564     //
 2565 
 2566     //  if (src == nullptr) return -1;
 2567     __ cbz(src, L_failed);
 2568 
 2569     //  if (src_pos < 0) return -1;
 2570     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2571 
 2572     //  if (dst == nullptr) return -1;
 2573     __ cbz(dst, L_failed);
 2574 
 2575     //  if (dst_pos < 0) return -1;
 2576     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2577 
 2578     // registers used as temp
 2579     const Register scratch_length    = r16; // elements count to copy
 2580     const Register scratch_src_klass = r17; // array klass
 2581     const Register lh                = r15; // layout helper
 2582 
 2583     //  if (length < 0) return -1;
 2584     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2585     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2586 
 2587     __ load_klass(scratch_src_klass, src);
 2588 #ifdef ASSERT
 2589     //  assert(src->klass() != nullptr);
 2590     {
 2591       BLOCK_COMMENT("assert klasses not null {");
 2592       Label L1, L2;
 2593       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2594       __ bind(L1);
 2595       __ stop("broken null klass");
 2596       __ bind(L2);
 2597       __ load_klass(rscratch1, dst);
 2598       __ cbz(rscratch1, L1);     // this would be broken also
 2599       BLOCK_COMMENT("} assert klasses not null done");
 2600     }
 2601 #endif
 2602 
 2603     // Load layout helper (32-bits)
 2604     //
 2605     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2606     // 32        30    24            16              8     2                 0
 2607     //
 2608     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2609     //
 2610 
 2611     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2612 
 2613     // Handle objArrays completely differently...
 2614     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2615     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2616     __ movw(rscratch1, objArray_lh);
 2617     __ eorw(rscratch2, lh, rscratch1);
 2618     __ cbzw(rscratch2, L_objArray);
 2619 
 2620     //  if (src->klass() != dst->klass()) return -1;
 2621     __ load_klass(rscratch2, dst);
 2622     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2623     __ cbnz(rscratch2, L_failed);
 2624 
 2625     // Check for flat inline type array -> return -1
 2626     __ test_flat_array_oop(src, rscratch2, L_failed);
 2627 
 2628     // Check for null-free (non-flat) inline type array -> handle as object array
 2629     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2630 
 2631     //  if (!src->is_Array()) return -1;
 2632     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2633 
 2634     // At this point, it is known to be a typeArray (array_tag 0x3).
 2635 #ifdef ASSERT
 2636     {
 2637       BLOCK_COMMENT("assert primitive array {");
 2638       Label L;
 2639       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2640       __ cmpw(lh, rscratch2);
 2641       __ br(Assembler::GE, L);
 2642       __ stop("must be a primitive array");
 2643       __ bind(L);
 2644       BLOCK_COMMENT("} assert primitive array done");
 2645     }
 2646 #endif
 2647 
 2648     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2649                            rscratch2, L_failed);
 2650 
 2651     // TypeArrayKlass
 2652     //
 2653     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2654     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2655     //
 2656 
 2657     const Register rscratch1_offset = rscratch1;    // array offset
 2658     const Register r15_elsize = lh; // element size
 2659 
 2660     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2661            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2662     __ add(src, src, rscratch1_offset);           // src array offset
 2663     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2664     BLOCK_COMMENT("choose copy loop based on element size");
 2665 
 2666     // next registers should be set before the jump to corresponding stub
 2667     const Register from     = c_rarg0;  // source array address
 2668     const Register to       = c_rarg1;  // destination array address
 2669     const Register count    = c_rarg2;  // elements count
 2670 
 2671     // 'from', 'to', 'count' registers should be set in such order
 2672     // since they are the same as 'src', 'src_pos', 'dst'.
 2673 
 2674     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2675 
 2676     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2677     // size in bytes).  We do a simple bitwise binary search.
 2678   __ BIND(L_copy_bytes);
 2679     __ tbnz(r15_elsize, 1, L_copy_ints);
 2680     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2681     __ lea(from, Address(src, src_pos));// src_addr
 2682     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2683     __ movw(count, scratch_length); // length
 2684     __ b(RuntimeAddress(byte_copy_entry));
 2685 
 2686   __ BIND(L_copy_shorts);
 2687     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2688     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2689     __ movw(count, scratch_length); // length
 2690     __ b(RuntimeAddress(short_copy_entry));
 2691 
 2692   __ BIND(L_copy_ints);
 2693     __ tbnz(r15_elsize, 0, L_copy_longs);
 2694     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2695     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2696     __ movw(count, scratch_length); // length
 2697     __ b(RuntimeAddress(int_copy_entry));
 2698 
 2699   __ BIND(L_copy_longs);
 2700 #ifdef ASSERT
 2701     {
 2702       BLOCK_COMMENT("assert long copy {");
 2703       Label L;
 2704       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2705       __ cmpw(r15_elsize, LogBytesPerLong);
 2706       __ br(Assembler::EQ, L);
 2707       __ stop("must be long copy, but elsize is wrong");
 2708       __ bind(L);
 2709       BLOCK_COMMENT("} assert long copy done");
 2710     }
 2711 #endif
 2712     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2713     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2714     __ movw(count, scratch_length); // length
 2715     __ b(RuntimeAddress(long_copy_entry));
 2716 
 2717     // ObjArrayKlass
 2718   __ BIND(L_objArray);
 2719     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2720 
 2721     Label L_plain_copy, L_checkcast_copy;
 2722     //  test array classes for subtyping
 2723     __ load_klass(r15, dst);
 2724     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2725     __ br(Assembler::NE, L_checkcast_copy);
 2726 
 2727     // Identically typed arrays can be copied without element-wise checks.
 2728     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2729                            rscratch2, L_failed);
 2730 
 2731     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2732     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2733     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2734     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2735     __ movw(count, scratch_length); // length
 2736   __ BIND(L_plain_copy);
 2737     __ b(RuntimeAddress(oop_copy_entry));
 2738 
 2739   __ BIND(L_checkcast_copy);
 2740     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2741     {
 2742       // Before looking at dst.length, make sure dst is also an objArray.
 2743       __ ldrw(rscratch1, Address(r15, lh_offset));
 2744       __ movw(rscratch2, objArray_lh);
 2745       __ eorw(rscratch1, rscratch1, rscratch2);
 2746       __ cbnzw(rscratch1, L_failed);
 2747 
 2748       // It is safe to examine both src.length and dst.length.
 2749       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2750                              r15, L_failed);
 2751 
 2752       __ load_klass(dst_klass, dst); // reload
 2753 
 2754       // Marshal the base address arguments now, freeing registers.
 2755       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2756       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2757       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2758       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2759       __ movw(count, length);           // length (reloaded)
 2760       Register sco_temp = c_rarg3;      // this register is free now
 2761       assert_different_registers(from, to, count, sco_temp,
 2762                                  dst_klass, scratch_src_klass);
 2763       // assert_clean_int(count, sco_temp);
 2764 
 2765       // Generate the type check.
 2766       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2767       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2768 
 2769       // Smashes rscratch1, rscratch2
 2770       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2771                           L_plain_copy);
 2772 
 2773       // Fetch destination element klass from the ObjArrayKlass header.
 2774       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2775       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2776       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2777 
 2778       // the checkcast_copy loop needs two extra arguments:
 2779       assert(c_rarg3 == sco_temp, "#3 already in place");
 2780       // Set up arguments for checkcast_copy_entry.
 2781       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2782       __ b(RuntimeAddress(checkcast_copy_entry));
 2783     }
 2784 
 2785   __ BIND(L_failed);
 2786     __ mov(r0, -1);
 2787     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2788     __ ret(lr);
 2789 
 2790     // record the stub entry and end
 2791     store_archive_data(stub_id, start, __ pc());
 2792 
 2793     return start;
 2794   }
 2795 
 2796   //
  // Generate stub for array fill. The stub id selects the element type
  // (jbyte, jshort or jint) and whether the "to" address is assumed to
  // be heapword aligned (the arrayof_* stub ids).
 2799   //
 2800   // Arguments for generated stub:
 2801   //   to:    c_rarg0
 2802   //   value: c_rarg1
 2803   //   count: c_rarg2 treated as signed
 2804   //
 2805   address generate_fill(StubId stub_id) {
 2806     BasicType t;
 2807     bool aligned;
 2808 
 2809     switch (stub_id) {
 2810     case StubId::stubgen_jbyte_fill_id:
 2811       t = T_BYTE;
 2812       aligned = false;
 2813       break;
 2814     case StubId::stubgen_jshort_fill_id:
 2815       t = T_SHORT;
 2816       aligned = false;
 2817       break;
 2818     case StubId::stubgen_jint_fill_id:
 2819       t = T_INT;
 2820       aligned = false;
 2821       break;
 2822     case StubId::stubgen_arrayof_jbyte_fill_id:
 2823       t = T_BYTE;
 2824       aligned = true;
 2825       break;
 2826     case StubId::stubgen_arrayof_jshort_fill_id:
 2827       t = T_SHORT;
 2828       aligned = true;
 2829       break;
 2830     case StubId::stubgen_arrayof_jint_fill_id:
 2831       t = T_INT;
 2832       aligned = true;
 2833       break;
 2834     default:
 2835       ShouldNotReachHere();
 2836     };
 2837     int entry_count = StubInfo::entry_count(stub_id);
 2838     assert(entry_count == 1, "sanity check");
 2839     address start = load_archive_data(stub_id);
 2840     if (start != nullptr) {
 2841       return start;
 2842     }
 2843     __ align(CodeEntryAlignment);
 2844     StubCodeMark mark(this, stub_id);
 2845     start = __ pc();
 2846 
 2847     BLOCK_COMMENT("Entry:");
 2848 
 2849     const Register to        = c_rarg0;  // source array address
 2850     const Register value     = c_rarg1;  // value
 2851     const Register count     = c_rarg2;  // elements count
 2852 
 2853     const Register bz_base = r10;        // base for block_zero routine
 2854     const Register cnt_words = r11;      // temp register
 2855 
 2856     __ enter();
 2857 
 2858     Label L_fill_elements, L_exit1;
 2859 
 2860     int shift = -1;
 2861     switch (t) {
 2862       case T_BYTE:
 2863         shift = 0;
 2864         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2865         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2866         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2867         __ br(Assembler::LO, L_fill_elements);
 2868         break;
 2869       case T_SHORT:
 2870         shift = 1;
 2871         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2872         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2873         __ br(Assembler::LO, L_fill_elements);
 2874         break;
 2875       case T_INT:
 2876         shift = 2;
 2877         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2878         __ br(Assembler::LO, L_fill_elements);
 2879         break;
 2880       default: ShouldNotReachHere();
 2881     }
 2882 
 2883     // Align source address at 8 bytes address boundary.
 2884     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2885     if (!aligned) {
 2886       switch (t) {
 2887         case T_BYTE:
 2888           // One byte misalignment happens only for byte arrays.
 2889           __ tbz(to, 0, L_skip_align1);
 2890           __ strb(value, Address(__ post(to, 1)));
 2891           __ subw(count, count, 1);
 2892           __ bind(L_skip_align1);
 2893           // Fallthrough
 2894         case T_SHORT:
 2895           // Two bytes misalignment happens only for byte and short (char) arrays.
 2896           __ tbz(to, 1, L_skip_align2);
 2897           __ strh(value, Address(__ post(to, 2)));
 2898           __ subw(count, count, 2 >> shift);
 2899           __ bind(L_skip_align2);
 2900           // Fallthrough
 2901         case T_INT:
 2902           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2903           __ tbz(to, 2, L_skip_align4);
 2904           __ strw(value, Address(__ post(to, 4)));
 2905           __ subw(count, count, 4 >> shift);
 2906           __ bind(L_skip_align4);
 2907           break;
 2908         default: ShouldNotReachHere();
 2909       }
 2910     }
 2911 
 2912     //
 2913     //  Fill large chunks
 2914     //
 2915     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2916     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2917     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2918     if (UseBlockZeroing) {
 2919       Label non_block_zeroing, rest;
 2920       // If the fill value is zero we can use the fast zero_words().
 2921       __ cbnz(value, non_block_zeroing);
 2922       __ mov(bz_base, to);
 2923       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2924       address tpc = __ zero_words(bz_base, cnt_words);
 2925       if (tpc == nullptr) {
 2926         fatal("CodeCache is full at generate_fill");
 2927       }
 2928       __ b(rest);
 2929       __ bind(non_block_zeroing);
 2930       __ fill_words(to, cnt_words, value);
 2931       __ bind(rest);
 2932     } else {
 2933       __ fill_words(to, cnt_words, value);
 2934     }
 2935 
 2936     // Remaining count is less than 8 bytes. Fill it by a single store.
 2937     // Note that the total length is no less than 8 bytes.
 2938     if (t == T_BYTE || t == T_SHORT) {
 2939       Label L_exit1;
 2940       __ cbzw(count, L_exit1);
 2941       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2942       __ str(value, Address(to, -8));    // overwrite some elements
 2943       __ bind(L_exit1);
 2944       __ leave();
 2945       __ ret(lr);
 2946     }
 2947 
 2948     // Handle copies less than 8 bytes.
 2949     Label L_fill_2, L_fill_4, L_exit2;
 2950     __ bind(L_fill_elements);
 2951     switch (t) {
 2952       case T_BYTE:
 2953         __ tbz(count, 0, L_fill_2);
 2954         __ strb(value, Address(__ post(to, 1)));
 2955         __ bind(L_fill_2);
 2956         __ tbz(count, 1, L_fill_4);
 2957         __ strh(value, Address(__ post(to, 2)));
 2958         __ bind(L_fill_4);
 2959         __ tbz(count, 2, L_exit2);
 2960         __ strw(value, Address(to));
 2961         break;
 2962       case T_SHORT:
 2963         __ tbz(count, 0, L_fill_4);
 2964         __ strh(value, Address(__ post(to, 2)));
 2965         __ bind(L_fill_4);
 2966         __ tbz(count, 1, L_exit2);
 2967         __ strw(value, Address(to));
 2968         break;
 2969       case T_INT:
 2970         __ cbzw(count, L_exit2);
 2971         __ strw(value, Address(to));
 2972         break;
 2973       default: ShouldNotReachHere();
 2974     }
 2975     __ bind(L_exit2);
 2976     __ leave();
 2977     __ ret(lr);
 2978 
 2979     // record the stub entry and end
 2980     store_archive_data(stub_id, start, __ pc());
 2981 
 2982     return start;
 2983   }
 2984 
 2985   address generate_unsafecopy_common_error_exit() {
          // Generate the common exit stub used by the unsafe copy/fill stubs.
          // The emitted code is only: leave(); r0 = 0; ret(lr). It contains no
          // enter() of its own, so the leave() unwinds a frame that was pushed by
          // whichever stub control arrived from (presumably one interrupted by a
          // faulting unsafe access -- confirm against UnsafeMemoryAccess).
          //
          // Returns the stub entry: the non-null address handed back by
          // load_archive_data() if there is one, otherwise the freshly
          // generated code.
 2986     StubId stub_id = StubId::stubgen_unsafecopy_common_id;
 2987     int entry_count = StubInfo::entry_count(stub_id);
 2988     assert(entry_count == 1, "sanity check");
 2989     address start = load_archive_data(stub_id);
 2990     if (start != nullptr) {
            // an entry was supplied by load_archive_data; nothing to generate
 2991       return start;
 2992     }
 2993     __ align(CodeEntryAlignment);
 2994     StubCodeMark mark(this, stub_id);
 2995     start = __ pc();
 2996       __ leave();
 2997       __ mov(r0, 0);  // the value left in r0 for the caller is always 0
 2998       __ ret(lr);
 2999 
 3000     // record the stub entry and end
 3001     store_archive_data(stub_id, start, __ pc());
 3002 
 3003     return start;
 3004   }
 3005 
 3006   //
 3007   //  Generate 'unsafe' set memory stub
 3008   //  Though just as safe as the other stubs, it takes an unscaled
 3009   //  size_t (# bytes) argument instead of an element count.
 3010   //
 3011   //  This fill operation is atomicity preserving: as long as the
 3012   //  address supplied is sufficiently aligned, all writes of up to 64
 3013   //  bits in size are single-copy atomic.
 3014   //
 3015   //  Input:
 3016   //    c_rarg0   - destination array address
 3017   //    c_rarg1   - byte count (size_t)
 3018   //    c_rarg2   - byte value
 3019   //
 3020   address generate_unsafe_setmemory() {
          // Emission outline:
          //   1. broadcast the fill byte into v0
          //   2. (AvoidUnalignedAccesses only) one 16-byte store, then round dest
          //      up to the next 16-byte boundary
          //   3. main loop, 64 bytes per iteration
          //   4. tail: one conditional store per set bit in count's low six
          //      bits, largest (32) first, smallest (1) last
          // Returns the stub entry address (archived or freshly generated).
 3021     StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
 3022     int entry_count = StubInfo::entry_count(stub_id);
 3023     assert(entry_count == 1, "sanity check");
 3024     // we expect one set of extra unsafememory access handler entries
 3025     GrowableArray<address> extras;
 3026     int extra_count =  1 * UnsafeMemoryAccess::COLUMN_COUNT;
 3027     address start = load_archive_data(stub_id, nullptr, &extras);
 3028     if (start != nullptr) {
            // an entry was supplied by load_archive_data: re-register the handler
            // addresses that came back in extras and skip generation
 3029       assert(extras.length() == extra_count,
 3030              "unexpected extra entry count %d", extras.length());
 3031       register_unsafe_access_handlers(extras, 0, 1);
 3032       return start;
 3033     }
 3034 
 3035     __ align(CodeEntryAlignment);
 3036     StubCodeMark mark(this, stub_id);
 3037     start = __ pc();
 3038 
 3039     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 3040     Label tail;
 3041 
 3042     {
 3043     UnsafeMemoryAccessMark umam(this, true, false);
 3044 
 3045     __ enter(); // required for proper stackwalking of RuntimeStub frame
 3046 
 3047     __ dup(v0, __ T16B, value);  // replicate the fill byte into all 16 byte lanes of v0
 3048 
 3049     if (AvoidUnalignedAccesses) {
            // Fewer than 16 bytes: go straight to the tail. count has not had 64
            // subtracted on this path, but the tail only looks at its low bits.
 3050       __ cmp(count, (u1)16);
 3051       __ br(__ LO, tail);
 3052 
            // Store 16 bytes at the incoming dest, then advance dest to the next
            // 16-byte boundary (a full 16 bytes when dest is already aligned).
            // The bytes skipped over were all covered by that first store.
 3053       __ mov(rscratch1, 16);
 3054       __ andr(rscratch2, dest, 15);
 3055       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 3056       __ strq(v0, Address(dest));
 3057       __ sub(count, count, rscratch1);
 3058       __ add(dest, dest, rscratch1);
 3059     }
 3060 
          // Pre-decrement by 64 so the loop can test the borrow: LO here means
          // fewer than 64 bytes remain and the main loop is skipped entirely.
 3061     __ subs(count, count, (u1)64);
 3062     __ br(__ LO, tail);
 3063     {
            // main loop: two 32-byte pair stores = 64 bytes per iteration
 3064       Label again;
 3065       __ bind(again);
 3066       __ stpq(v0, v0, Address(dest));
 3067       __ stpq(v0, v0, Address(dest, 32));
 3068 
 3069       __ subs(count, count, 64);
 3070       __ add(dest, dest, 64);
 3071       __ br(__ HS, again);  // no borrow: at least another 64 bytes to go
 3072     }
 3073 
 3074     __ bind(tail);
 3075     // The count of bytes is off by 64, but we don't need to correct
 3076     // it because we're only going to use the least-significant few
 3077     // count bits from here on.
 3078     // __ add(count, count, 64);
 3079 
          // Each block below tests one bit of count and, if set, stores that many
          // bytes and post-increments dest.
 3080     {
 3081       Label dont;
 3082       __ tbz(count, exact_log2(32), dont);
 3083       __ stpq(v0, v0, __ post(dest, 32));
 3084       __ bind(dont);
 3085     }
 3086     {
 3087       Label dont;
 3088       __ tbz(count, exact_log2(16), dont);
 3089       __ strq(v0, __ post(dest, 16));
 3090       __ bind(dont);
 3091     }
 3092     {
 3093       Label dont;
 3094       __ tbz(count, exact_log2(8), dont);
 3095       __ strd(v0, __ post(dest, 8));
 3096       __ bind(dont);
 3097     }
 3098 
          // Early out when no 4-, 2- or 1-byte remainder is left.
 3099     Label finished;
 3100     __ tst(count, 7);
 3101     __ br(__ EQ, finished);
 3102 
 3103     {
 3104       Label dont;
 3105       __ tbz(count, exact_log2(4), dont);
 3106       __ strs(v0, __ post(dest, 4));
 3107       __ bind(dont);
 3108     }
 3109     {
 3110       Label dont;
 3111       __ tbz(count, exact_log2(2), dont);
            // copy bits 0..7 of value into bits 8..15 so that the halfword store
            // writes the fill byte twice; the low byte used by strb below is
            // left unchanged
 3112       __ bfi(value, value, 8, 8);
 3113       __ strh(value, __ post(dest, 2));
 3114       __ bind(dont);
 3115     }
 3116     {
 3117       Label dont;
 3118       __ tbz(count, exact_log2(1), dont);
 3119       __ strb(value, Address(dest));  // last store: dest no longer needs advancing
 3120       __ bind(dont);
 3121     }
 3122 
 3123     __ bind(finished);
 3124     __ leave();
 3125     __ ret(lr);
 3126     // have to exit the block and destroy the UnsafeMemoryAccessMark
 3127     // in order to retrieve the handler end address
 3128     }
 3129 
 3130     // install saved handler addresses in extras
 3131     address end = __ pc();
 3132     retrieve_unsafe_access_handlers(start, end, extras);
 3133     assert(extras.length() == extra_count,
 3134            "incorrect handlers count %d", extras.length());
 3135     // record the stub entry and end plus the extras
 3136     store_archive_data(stub_id, start, end, nullptr, &extras);
 3137 
 3138     return start;
 3139   }
 3140 
 3141   address generate_data_cache_writeback() {
          // Generate the stub that writes back one data cache line.
          //
          // Input:
          //   c_rarg0   - address of the line to write back
          //
          // The emitted code is enter(); cache_wb(line); leave(); ret(lr).
          // Returns the stub entry address (archived or freshly generated).
 3142     const Register line        = c_rarg0;  // address of line to write back
 3143 
 3144     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 3145     int entry_count = StubInfo::entry_count(stub_id);
 3146     assert(entry_count == 1, "sanity check");
 3147     address start = load_archive_data(stub_id);
 3148     if (start != nullptr) {
            // an entry was supplied by load_archive_data; nothing to generate
 3149       return start;
 3150     }
 3151     __ align(CodeEntryAlignment);
 3152     StubCodeMark mark(this, stub_id);
 3153 
 3154     start = __ pc();
 3155     __ enter();
 3156     __ cache_wb(Address(line, 0));
 3157     __ leave();
 3158     __ ret(lr);
 3159 
 3160     // record the stub entry and end
 3161     store_archive_data(stub_id, start, __ pc());
 3162 
 3163     return start;
 3164   }
 3165 
 3166   address generate_data_cache_writeback_sync() {
          // Generate the stub that synchronizes around data cache writebacks.
          //
          // Input:
          //   c_rarg0   - non-zero for the "pre" sync, zero for the "post" sync
          //
          // Returns the stub entry address (archived or freshly generated).
 3167     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 3168     int entry_count = StubInfo::entry_count(stub_id);
 3169     assert(entry_count == 1, "sanity check");
 3170     address start = load_archive_data(stub_id);
 3171     if (start != nullptr) {
            // an entry was supplied by load_archive_data; nothing to generate
 3172       return start;
 3173     }
 3174     const Register is_pre     = c_rarg0;  // pre or post sync
 3175     __ align(CodeEntryAlignment);
 3176     StubCodeMark mark(this, stub_id);
 3177 
 3178     // pre wbsync is a no-op
 3179     // post wbsync translates to an sfence
          // NOTE(review): "sfence" is x86 terminology; what cache_wbsync(false)
          // emits on AArch64 is not visible here -- presumably a memory barrier,
          // confirm in MacroAssembler::cache_wbsync.
 3180 
 3181     Label skip;
 3182     start = __ pc();
 3183     __ enter();
 3184     __ cbnz(is_pre, skip);      // pre sync: jump over the post-sync code
 3185     __ cache_wbsync(false);
 3186     __ bind(skip);
 3187     __ leave();
 3188     __ ret(lr);
 3189 
 3190     // record the stub entry and end
 3191     store_archive_data(stub_id, start, __ pc());
 3192 
 3193     return start;
 3194   }
 3195 
 3196   void generate_arraycopy_stubs() {
          // Generate every array copy and array fill stub and publish each entry
          // in the matching StubRoutines field. The order of the calls below is
          // significant: a generator that takes an entry address as an argument
          // can only run after the stub providing that entry has been generated.
          //
 3197     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
 3198     // entry immediately following their stack push. This can be used
 3199     // as a post-push branch target for compatible stubs when they
 3200     // identify a special case that can be handled by the fallback
 3201     // stub e.g. a disjoint copy stub may be used as a special case
 3202     // fallback for its compatible conjoint copy stub.
 3203     //
 3204     // A no push entry is always returned in the following local and
 3205     // then published by assigning to the appropriate entry field in
 3206     // class StubRoutines. The entry value is then passed to the
 3207     // generator for the compatible stub. That means the entry must be
 3208     // listed when saving to/restoring from the AOT cache, ensuring
 3209     // that the inter-stub jumps are noted at AOT-cache save and
 3210     // relocated at AOT cache load.
 3211     address nopush_entry;
 3212 
 3213     // generate the common exit first so later stubs can rely on it if
 3214     // they want an UnsafeMemoryAccess exit non-local to the stub
 3215     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 3216     // register the stub as the default exit with class UnsafeMemoryAccess
 3217     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 3218 
 3219     // generate and publish aarch64-specific bulk copy routines first
 3220     // so we can call them from other copy stubs
 3221     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3222     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3223 
 3224     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3225     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3226 
 3227     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 3228     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 3229 
 3230     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 3231 
 3232     //*** jbyte
 3233     // Always need aligned and unaligned versions
 3234     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 3235     // disjoint nopush entry is needed by conjoint copy
 3236     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 3237     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 3238     // conjoint nopush entry is needed by generic/unsafe copy
 3239     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 3240     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 3241     // disjoint arrayof nopush entry is needed by conjoint copy
 3242     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 3243     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 3244 
 3245     //*** jshort
 3246     // Always need aligned and unaligned versions
 3247     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 3248     // disjoint nopush entry is needed by conjoint copy
 3249     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 3250     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 3251     // conjoint nopush entry is used by generic/unsafe copy
 3252     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 3253     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 3254     // disjoint arrayof nopush entry is needed by conjoint copy
 3255     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 3256     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 3257 
 3258     //*** jint
 3259     // Aligned versions
 3260     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 3261     // disjoint arrayof nopush entry is needed by conjoint copy
 3262     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 3263     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
 3264     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 3265     // jint_arraycopy_nopush always points to the unaligned version
 3266     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 3267     // disjoint nopush entry is needed by conjoint copy
 3268     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 3269     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 3270     // conjoint nopush entry is needed by generic/unsafe copy
 3271     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 3272 
 3273     //*** jlong
 3274     // It is always aligned
 3275     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 3276     // disjoint arrayof nopush entry is needed by conjoint copy
 3277     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 3278     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 3279     // conjoint nopush entry is needed by generic/unsafe copy
 3280     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 3281     // disjoint normal/nopush and conjoint normal entries are not
 3282     // generated since the arrayof versions are the same
 3283     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 3284     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 3285     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 3286 
 3287     //*** oops
 3288     {
 3289       StubRoutines::_arrayof_oop_disjoint_arraycopy
 3290         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 3291       // disjoint arrayof nopush entry is needed by conjoint copy
 3292       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 3293       StubRoutines::_arrayof_oop_arraycopy
 3294         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 3295       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 3296       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 3297       // Aligned versions without pre-barriers
 3298       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 3299         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 3300       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 3301       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 3302       // note that we don't need a returned nopush entry because the
 3303       // generic/unsafe copy does not cater for uninit arrays.
 3304       StubRoutines::_arrayof_oop_arraycopy_uninit
 3305         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 3306     }
 3307 
 3308     // for oop copies reuse arrayof entries for non-arrayof cases
 3309     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 3310     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 3311     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 3312     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 3313     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 3314     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 3315 
 3316     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 3317     // checkcast nopush entry is needed by generic copy
 3318     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 3319     // note that we don't need a returned nopush entry because the
 3320     // generic copy does not cater for uninit arrays.
 3321     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 3322 
 3323     // unsafe arraycopy may fallback on conjoint stubs
 3324     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 3325                                                               StubRoutines::_jshort_arraycopy_nopush,
 3326                                                               StubRoutines::_jint_arraycopy_nopush,
 3327                                                               StubRoutines::_jlong_arraycopy_nopush);
 3328 
 3329     // generic arraycopy may fallback on conjoint stubs
 3330     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 3331                                                                StubRoutines::_jshort_arraycopy_nopush,
 3332                                                                StubRoutines::_jint_arraycopy_nopush,
 3333                                                                StubRoutines::_oop_arraycopy_nopush,
 3334                                                                StubRoutines::_jlong_arraycopy_nopush,
 3335                                                                StubRoutines::_checkcast_arraycopy_nopush);
 3336 
          // fill stubs: one per element type (jbyte, jshort, jint), each in a
          // plain and an arrayof variant
 3337     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 3338     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 3339     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 3340     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 3341     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 3342     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 3343   }
 3344 
 3345   void generate_math_stubs() { Unimplemented(); }  // no math stubs are generated here; any call ends in Unimplemented()
 3346 
 3347   // Arguments:
 3348   //
 3349   // Inputs:
 3350   //   c_rarg0   - source byte array address
 3351   //   c_rarg1   - destination byte array address
 3352   //   c_rarg2   - sessionKe (key) in little endian int array
 3353   //
 3354   address generate_aescrypt_encryptBlock() {
          // Generate the single-block AES encryption stub. The emitted code reads
          // the key array length, loads the round keys (aesenc_loadkeys),
          // encrypts from 'from' to 'to' (aesecb_encrypt) and returns with r0 = 0.
          //
          // Requires UseAES. Returns the stub entry address (archived or freshly
          // generated).
 3355     assert(UseAES, "need AES cryptographic extension support");
 3356     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 3357     int entry_count = StubInfo::entry_count(stub_id);
 3358     assert(entry_count == 1, "sanity check");
 3359     address start = load_archive_data(stub_id);
 3360     if (start != nullptr) {
            // an entry was supplied by load_archive_data; nothing to generate
 3361       return start;
 3362     }
 3363     __ align(CodeEntryAlignment);
 3364     StubCodeMark mark(this, stub_id);
 3365 
 3366     const Register from        = c_rarg0;  // source array address
 3367     const Register to          = c_rarg1;  // destination array address
 3368     const Register key         = c_rarg2;  // key array address
 3369     const Register keylen      = rscratch1;
 3370 
 3371     start = __ pc();
 3372     __ enter();
 3373 
          // key is the address of element 0 of the int array; the offset steps
          // back from the element base to the array's length field
 3374     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3375 
 3376     __ aesenc_loadkeys(key, keylen);
 3377     __ aesecb_encrypt(from, to, keylen);
 3378 
 3379     __ mov(r0, 0);  // return value is always 0
 3380 
 3381     __ leave();
 3382     __ ret(lr);
 3383 
 3384     // record the stub entry and end
 3385     store_archive_data(stub_id, start, __ pc());
 3386 
 3387     return start;
 3388   }
 3389 
 3390   // Arguments:
 3391   //
 3392   // Inputs:
 3393   //   c_rarg0   - source byte array address
 3394   //   c_rarg1   - destination byte array address
 3395   //   c_rarg2   - sessionKd (key) in little endian int array
 3396   //
 3397   address generate_aescrypt_decryptBlock() {
          // Generate the single-block AES decryption stub. The emitted code reads
          // the key array length, decrypts from 'from' to 'to' (aesecb_decrypt,
          // which is handed both key and keylen) and returns with r0 = 0.
          //
          // Requires UseAES. Returns the stub entry address (archived or freshly
          // generated).
 3398     assert(UseAES, "need AES cryptographic extension support");
 3399     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 3400     int entry_count = StubInfo::entry_count(stub_id);
 3401     assert(entry_count == 1, "sanity check");
 3402     address start = load_archive_data(stub_id);
 3403     if (start != nullptr) {
            // an entry was supplied by load_archive_data; nothing to generate
 3404       return start;
 3405     }
 3406     __ align(CodeEntryAlignment);
 3407     StubCodeMark mark(this, stub_id);
          // NOTE(review): L_doLast is declared but never bound or branched to in
          // this function -- looks like a leftover; confirm before removing.
 3408     Label L_doLast;
 3409 
 3410     const Register from        = c_rarg0;  // source array address
 3411     const Register to          = c_rarg1;  // destination array address
 3412     const Register key         = c_rarg2;  // key array address
 3413     const Register keylen      = rscratch1;
 3414 
 3415     start = __ pc();
 3416     __ enter(); // required for proper stackwalking of RuntimeStub frame
 3417 
          // key is the address of element 0 of the int array; the offset steps
          // back from the element base to the array's length field
 3418     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3419 
 3420     __ aesecb_decrypt(from, to, key, keylen);
 3421 
 3422     __ mov(r0, 0);  // return value is always 0
 3423 
 3424     __ leave();
 3425     __ ret(lr);
 3426 
 3427     // record the stub entry and end
 3428     store_archive_data(stub_id, start, __ pc());
 3429 
 3430     return start;
 3431   }
 3432 
 3433   // Arguments:
 3434   //
 3435   // Inputs:
 3436   //   c_rarg0   - source byte array address
 3437   //   c_rarg1   - destination byte array address
 3438   //   c_rarg2   - sessionKe (key) in little endian int array
 3439   //   c_rarg3   - r vector byte array address
 3440   //   c_rarg4   - input length
 3441   //
 3442   // Output:
 3443   //   r0        - input length
 3444   //
 3445   address generate_cipherBlockChaining_encryptAESCrypt() {
          // Generate the AES-CBC encryption stub. Each 16-byte block read from
          // 'from' is XORed into the running chaining value in v0 (initially
          // loaded from rvec), run through the AES rounds and written to 'to';
          // the result stays in v0 as the chaining value for the next block.
          // On exit the last output block is stored back to rvec and r0 holds
          // the original 32-bit length.
          //
          // The loop tests len_reg only after a block has been processed, so the
          // emitted code assumes len_reg is a non-zero multiple of 16 on entry.
          //
          // Requires UseAES. Returns the stub entry address (archived or freshly
          // generated).
 3446     assert(UseAES, "need AES cryptographic extension support");
 3447     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 3448     int entry_count = StubInfo::entry_count(stub_id);
 3449     assert(entry_count == 1, "sanity check");
 3450     address start = load_archive_data(stub_id);
 3451     if (start != nullptr) {
            // an entry was supplied by load_archive_data; nothing to generate
 3452       return start;
 3453     }
 3454     __ align(CodeEntryAlignment);
 3455     StubCodeMark mark(this, stub_id);
 3456 
 3457     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3458 
 3459     const Register from        = c_rarg0;  // source array address
 3460     const Register to          = c_rarg1;  // destination array address
 3461     const Register key         = c_rarg2;  // key array address
 3462     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3463                                            // and left with the results of the last encryption block
 3464     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3465     const Register keylen      = rscratch1;
 3466 
 3467     start = __ pc();
 3468 
 3469       __ enter();
 3470 
 3471       __ movw(rscratch2, len_reg);  // keep the incoming length for the return value
 3472 
            // key is the address of element 0 of the int array; the offset steps
            // back from the element base to the array's length field
 3473       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3474 
 3475       __ ld1(v0, __ T16B, rvec);  // v0 = initial chaining value
 3476 
            // Dispatch on the key array length in ints: below 52 starts at
            // L_loadkeys_44, exactly 52 at L_loadkeys_52, anything larger falls
            // through and loads two extra round keys first (44/52/60 ints are
            // presumably the AES-128/192/256 key schedules -- TODO confirm).
            // The flags set by this compare are tested again at the top of
            // every L_aes_loop iteration, so nothing emitted between here and
            // there may be a flag-setting instruction.
 3477       __ cmpw(keylen, 52);
 3478       __ br(Assembler::CC, L_loadkeys_44);
 3479       __ br(Assembler::EQ, L_loadkeys_52);
 3480 
            // Round keys end up in v17..v31; each is byte-reversed within its
            // 32-bit words (rev32) after loading.
 3481       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3482       __ rev32(v17, __ T16B, v17);
 3483       __ rev32(v18, __ T16B, v18);
 3484     __ BIND(L_loadkeys_52);
 3485       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3486       __ rev32(v19, __ T16B, v19);
 3487       __ rev32(v20, __ T16B, v20);
 3488     __ BIND(L_loadkeys_44);
 3489       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3490       __ rev32(v21, __ T16B, v21);
 3491       __ rev32(v22, __ T16B, v22);
 3492       __ rev32(v23, __ T16B, v23);
 3493       __ rev32(v24, __ T16B, v24);
 3494       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3495       __ rev32(v25, __ T16B, v25);
 3496       __ rev32(v26, __ T16B, v26);
 3497       __ rev32(v27, __ T16B, v27);
 3498       __ rev32(v28, __ T16B, v28);
 3499       __ ld1(v29, v30, v31, __ T16B, key);
 3500       __ rev32(v29, __ T16B, v29);
 3501       __ rev32(v30, __ T16B, v30);
 3502       __ rev32(v31, __ T16B, v31);
 3503 
 3504     __ BIND(L_aes_loop);
 3505       __ ld1(v1, __ T16B, __ post(from, 16));
 3506       __ eor(v0, __ T16B, v0, v1);  // chain: input block XOR previous output (or initial rvec)
 3507 
            // still the flags from cmpw(keylen, 52) above: skip the rounds whose
            // keys were not loaded
 3508       __ br(Assembler::CC, L_rounds_44);
 3509       __ br(Assembler::EQ, L_rounds_52);
 3510 
 3511       __ aese(v0, v17); __ aesmc(v0, v0);
 3512       __ aese(v0, v18); __ aesmc(v0, v0);
 3513     __ BIND(L_rounds_52);
 3514       __ aese(v0, v19); __ aesmc(v0, v0);
 3515       __ aese(v0, v20); __ aesmc(v0, v0);
 3516     __ BIND(L_rounds_44);
 3517       __ aese(v0, v21); __ aesmc(v0, v0);
 3518       __ aese(v0, v22); __ aesmc(v0, v0);
 3519       __ aese(v0, v23); __ aesmc(v0, v0);
 3520       __ aese(v0, v24); __ aesmc(v0, v0);
 3521       __ aese(v0, v25); __ aesmc(v0, v0);
 3522       __ aese(v0, v26); __ aesmc(v0, v0);
 3523       __ aese(v0, v27); __ aesmc(v0, v0);
 3524       __ aese(v0, v28); __ aesmc(v0, v0);
 3525       __ aese(v0, v29); __ aesmc(v0, v0);
            // last round has no aesmc step; the final round key (v31) is XORed in
 3526       __ aese(v0, v30);
 3527       __ eor(v0, __ T16B, v0, v31);
 3528 
 3529       __ st1(v0, __ T16B, __ post(to, 16));
 3530 
            // subw/cbnzw (not subsw) so the key-length flags survive the loop
 3531       __ subw(len_reg, len_reg, 16);
 3532       __ cbnzw(len_reg, L_aes_loop);
 3533 
 3534       __ st1(v0, __ T16B, rvec);  // write the final chaining value back to rvec
 3535 
 3536       __ mov(r0, rscratch2);  // return the length saved on entry
 3537 
 3538       __ leave();
 3539       __ ret(lr);
 3540 
 3541       // record the stub entry and end
 3542       store_archive_data(stub_id, start, __ pc());
 3543 
 3544       return start;
 3545   }
 3546 
 3547   // Arguments:
 3548   //
 3549   // Inputs:
 3550   //   c_rarg0   - source byte array address
 3551   //   c_rarg1   - destination byte array address
 3552   //   c_rarg2   - sessionKd (key) in little endian int array
 3553   //   c_rarg3   - r vector byte array address
 3554   //   c_rarg4   - input length
 3555   //
 3556   // Output:
 3557   //   r0        - input length
 3558   //
 3559   address generate_cipherBlockChaining_decryptAESCrypt() {
          // Generate the AES-CBC decryption stub. Each 16-byte block read from
          // 'from' is copied aside, run through the AES decryption rounds, XORed
          // with the previous input block (initially the contents of rvec) and
          // written to 'to'. On exit the last input block is stored back to rvec
          // and r0 holds the original 32-bit length.
          //
          // The loop tests len_reg only after a block has been processed, so the
          // emitted code assumes len_reg is a non-zero multiple of 16 on entry.
          //
          // Requires UseAES. Returns the stub entry address (archived or freshly
          // generated).
 3560     assert(UseAES, "need AES cryptographic extension support");
 3561     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3562     int entry_count = StubInfo::entry_count(stub_id);
 3563     assert(entry_count == 1, "sanity check");
 3564     address start = load_archive_data(stub_id);
 3565     if (start != nullptr) {
            // an entry was supplied by load_archive_data; nothing to generate
 3566       return start;
 3567     }
 3568     __ align(CodeEntryAlignment);
 3569     StubCodeMark mark(this, stub_id);
 3570 
 3571     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3572 
 3573     const Register from        = c_rarg0;  // source array address
 3574     const Register to          = c_rarg1;  // destination array address
 3575     const Register key         = c_rarg2;  // key array address
 3576     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3577                                            // and left holding the last input (ciphertext) block consumed
 3578     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3579     const Register keylen      = rscratch1;
 3580 
 3581     start = __ pc();
 3582 
 3583       __ enter();
 3584 
 3585       __ movw(rscratch2, len_reg);  // keep the incoming length for the return value
 3586 
            // key is the address of element 0 of the int array; the offset steps
            // back from the element base to the array's length field
 3587       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3588 
 3589       __ ld1(v2, __ T16B, rvec);  // v2 = initial chaining value
 3590 
            // the first 16 bytes of the key array are applied last (final XOR in
            // the loop below), so they are kept in v31
 3591       __ ld1(v31, __ T16B, __ post(key, 16));
 3592       __ rev32(v31, __ T16B, v31);
 3593 
            // Dispatch on the key array length in ints: below 52 starts at
            // L_loadkeys_44, exactly 52 at L_loadkeys_52, anything larger falls
            // through and loads two extra round keys first (44/52/60 ints are
            // presumably the AES-128/192/256 key schedules -- TODO confirm).
            // The flags set by this compare are tested again at the top of
            // every L_aes_loop iteration, so nothing emitted between here and
            // there may be a flag-setting instruction.
 3594       __ cmpw(keylen, 52);
 3595       __ br(Assembler::CC, L_loadkeys_44);
 3596       __ br(Assembler::EQ, L_loadkeys_52);
 3597 
 3598       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3599       __ rev32(v17, __ T16B, v17);
 3600       __ rev32(v18, __ T16B, v18);
 3601     __ BIND(L_loadkeys_52);
 3602       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3603       __ rev32(v19, __ T16B, v19);
 3604       __ rev32(v20, __ T16B, v20);
 3605     __ BIND(L_loadkeys_44);
 3606       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3607       __ rev32(v21, __ T16B, v21);
 3608       __ rev32(v22, __ T16B, v22);
 3609       __ rev32(v23, __ T16B, v23);
 3610       __ rev32(v24, __ T16B, v24);
 3611       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3612       __ rev32(v25, __ T16B, v25);
 3613       __ rev32(v26, __ T16B, v26);
 3614       __ rev32(v27, __ T16B, v27);
 3615       __ rev32(v28, __ T16B, v28);
 3616       __ ld1(v29, v30, __ T16B, key);
 3617       __ rev32(v29, __ T16B, v29);
 3618       __ rev32(v30, __ T16B, v30);
 3619 
 3620     __ BIND(L_aes_loop);
 3621       __ ld1(v0, __ T16B, __ post(from, 16));
 3622       __ orr(v1, __ T16B, v0, v0);  // v1 = copy of the input block, needed for chaining
 3623 
            // still the flags from cmpw(keylen, 52) above: skip the rounds whose
            // keys were not loaded
 3624       __ br(Assembler::CC, L_rounds_44);
 3625       __ br(Assembler::EQ, L_rounds_52);
 3626 
 3627       __ aesd(v0, v17); __ aesimc(v0, v0);
 3628       __ aesd(v0, v18); __ aesimc(v0, v0);
 3629     __ BIND(L_rounds_52);
 3630       __ aesd(v0, v19); __ aesimc(v0, v0);
 3631       __ aesd(v0, v20); __ aesimc(v0, v0);
 3632     __ BIND(L_rounds_44);
 3633       __ aesd(v0, v21); __ aesimc(v0, v0);
 3634       __ aesd(v0, v22); __ aesimc(v0, v0);
 3635       __ aesd(v0, v23); __ aesimc(v0, v0);
 3636       __ aesd(v0, v24); __ aesimc(v0, v0);
 3637       __ aesd(v0, v25); __ aesimc(v0, v0);
 3638       __ aesd(v0, v26); __ aesimc(v0, v0);
 3639       __ aesd(v0, v27); __ aesimc(v0, v0);
 3640       __ aesd(v0, v28); __ aesimc(v0, v0);
 3641       __ aesd(v0, v29); __ aesimc(v0, v0);
            // last round has no aesimc step; then XOR in the key held in v31
 3642       __ aesd(v0, v30);
 3643       __ eor(v0, __ T16B, v0, v31);
 3644       __ eor(v0, __ T16B, v0, v2);  // un-chain: XOR with previous input block (or initial rvec)
 3645 
 3646       __ st1(v0, __ T16B, __ post(to, 16));
 3647       __ orr(v2, __ T16B, v1, v1);  // this input block chains into the next iteration
 3648 
            // subw/cbnzw (not subsw) so the key-length flags survive the loop
 3649       __ subw(len_reg, len_reg, 16);
 3650       __ cbnzw(len_reg, L_aes_loop);
 3651 
 3652       __ st1(v2, __ T16B, rvec);  // write the final chaining value back to rvec
 3653 
 3654       __ mov(r0, rscratch2);  // return the length saved on entry
 3655 
 3656       __ leave();
 3657       __ ret(lr);
 3658 
 3659     // record the stub entry and end
 3660     store_archive_data(stub_id, start, __ pc());
 3661 
 3662     return start;
 3663   }
 3664 
 3665   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3666   // Inputs: 128-bits. in is preserved.
 3667   // The least-significant 64-bit word is in the upper dword of each vector.
 3668   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 3669   // Output: result
        //
        // Emits four vector instructions and uses tmp as scratch. result, tmp
        // and inc must be three different registers (asserted below); 'in' is
        // not part of that check.
 3670   void be_add_128_64(FloatRegister result, FloatRegister in,
 3671                      FloatRegister inc, FloatRegister tmp) {
 3672     assert_different_registers(result, tmp, inc);
 3673 
 3674     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3675                                            // input
          // unsigned inc > result in a lane means that lane's addition wrapped
 3676     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
 3677     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3678                                            // MSD == 0 (must be!) to LSD
 3679     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3680   }
 3681 
 3682   // CTR AES crypt.
 3683   // Arguments:
 3684   //
 3685   // Inputs:
 3686   //   c_rarg0   - source byte array address
 3687   //   c_rarg1   - destination byte array address
 3688   //   c_rarg2   - sessionKe (key) in little endian int array
 3689   //   c_rarg3   - counter vector byte array address (16 bytes; read, and written
        //               back with the incremented counter)
 3690   //   c_rarg4   - input length
 3691   //   c_rarg5   - saved encryptedCounter start (16 bytes; read by the byte-wise
        //               path, rewritten each time a counter block is encrypted)
 3692   //   c_rarg6   - address of the saved used length (32-bit; read on entry,
        //               written back on exit)
 3693   //
 3694   // Output:
 3695   //   r0       - input length
 3696   //
        // Returns the stub's entry address. If load_archive_data() already yields a
        // non-null address for this stub id, that address is returned and no code is
        // generated.
 3697   address generate_counterMode_AESCrypt() {
 3698     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3699     int entry_count = StubInfo::entry_count(stub_id);
 3700     assert(entry_count == 1, "sanity check");
 3701     address start = load_archive_data(stub_id);
 3702     if (start != nullptr) {
 3703       return start;
 3704     }
 3705     const Register in = c_rarg0;
 3706     const Register out = c_rarg1;
 3707     const Register key = c_rarg2;
 3708     const Register counter = c_rarg3;
 3709     const Register saved_len = c_rarg4, len = r10;  // len: bytes still to process
 3710     const Register saved_encrypted_ctr = c_rarg5;
 3711     const Register used_ptr = c_rarg6, used = r12;  // used: working copy of *used_ptr
 3712 
 3713     const Register offset = r7;   // running byte offset applied to both in and out
 3714     const Register keylen = r11;
 3715 
 3716     const unsigned char block_size = 16;
 3717     const int bulk_width = 4;
 3718     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3719     // performance with larger data sizes, but it also means that the
 3720     // fast path isn't used until you have at least 8 blocks, and up
 3721     // to 127 bytes of data will be executed on the slow path. For
 3722     // that reason, and also so as not to blow away too much icache, 4
 3723     // blocks seems like a sensible compromise.
 3724 
 3725     // Algorithm:
 3726     //
 3727     //    if (len == 0) {
 3728     //        goto DONE;
 3729     //    }
 3730     //    int result = len;
 3731     //    do {
 3732     //        if (used >= blockSize) {
 3733     //            if (len >= bulk_width * blockSize) {
 3734     //                CTR_large_block();
 3735     //                if (len == 0)
 3736     //                    goto DONE;
 3737     //            }
 3738     //            for (;;) {
 3739     //                16ByteVector v0 = counter;
 3740     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3741     //                used = 0;
 3742     //                if (len < blockSize)
 3743     //                    break;    /* goto NEXT */
 3744     //                16ByteVector v1 = load16Bytes(in, offset);
 3745     //                v1 = v1 ^ encryptedCounter;
 3746     //                store16Bytes(out, offset);
 3747     //                used = blockSize;
 3748     //                offset += blockSize;
 3749     //                len -= blockSize;
 3750     //                if (len == 0)
 3751     //                    goto DONE;
 3752     //            }
 3753     //        }
 3754     //      NEXT:
 3755     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3756     //        len--;
 3757     //    } while (len != 0);
 3758     //  DONE:
 3759     //    return result;
 3760     //
 3761     // CTR_large_block()
 3762     //    Wide bulk encryption of whole blocks.
 3763 
 3764     __ align(CodeEntryAlignment);
 3765     StubCodeMark mark(this, stub_id);
 3766     start = __ pc();
 3767     __ enter();
 3768 
 3769     Label DONE, CTR_large_block, large_block_return;
 3770     __ ldrw(used, Address(used_ptr));   // used = *used_ptr
 3771     __ cbzw(saved_len, DONE);           // zero-length input: nothing to do
 3772 
 3773     __ mov(len, saved_len);
 3774     __ mov(offset, 0);
 3775 
 3776     // Compute #rounds for AES based on the length of the key array
          // (the length field is addressed relative to key, which is offset by the
          // int-array base offset from the array header)
 3777     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3778 
 3779     __ aesenc_loadkeys(key, keylen);
 3780 
 3781     {
 3782       Label L_CTR_loop, NEXT;
 3783 
 3784       __ bind(L_CTR_loop);
 3785 
            // While used < block_size, go straight to the byte-wise path at NEXT,
            // which XORs with saved_encrypted_ctr[used].
 3786       __ cmp(used, block_size);
 3787       __ br(__ LO, NEXT);
 3788 
 3789       // Maybe we have a lot of data
 3790       __ subsw(rscratch1, len, bulk_width * block_size);
 3791       __ br(__ HS, CTR_large_block);    // taken when len >= bulk_width * block_size
 3792       __ BIND(large_block_return);
 3793       __ cbzw(len, DONE);
 3794 
 3795       // Setup the counter
 3796       __ movi(v4, __ T4S, 0);
 3797       __ movi(v5, __ T4S, 1);
 3798       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3799 
 3800       // 128-bit big-endian increment
 3801       __ ld1(v0, __ T16B, counter);
 3802       __ rev64(v16, __ T16B, v0);
 3803       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3804       __ rev64(v16, __ T16B, v16);
 3805       __ st1(v16, __ T16B, counter);
 3806       // Previous counter value is in v0
 3807       // v4 contains { 0, 1 }
 3808 
 3809       {
 3810         // We have fewer than bulk_width blocks of data left. Encrypt
 3811         // them one by one until there is less than a full block
 3812         // remaining, being careful to save both the encrypted counter
 3813         // and the counter.
 3814 
 3815         Label inner_loop;
 3816         __ bind(inner_loop);
 3817         // Counter to encrypt is in v0
 3818         __ aesecb_encrypt(noreg, noreg, keylen);
 3819         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3820 
 3821         // Do we have a remaining full block?
 3822 
 3823         __ mov(used, 0);
 3824         __ cmp(len, block_size);
 3825         __ br(__ LO, NEXT);
 3826 
 3827         // Yes, we have a full block
 3828         __ ldrq(v1, Address(in, offset));
 3829         __ eor(v1, __ T16B, v1, v0);
 3830         __ strq(v1, Address(out, offset));
 3831         __ mov(used, block_size);
 3832         __ add(offset, offset, block_size);
 3833 
 3834         __ subw(len, len, block_size);
 3835         __ cbzw(len, DONE);
 3836 
 3837         // Increment the counter, store it back
 3838         __ orr(v0, __ T16B, v16, v16);   // v0 = current counter, the next one to encrypt
 3839         __ rev64(v16, __ T16B, v16);
 3840         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3841         __ rev64(v16, __ T16B, v16);
 3842         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3843 
 3844         __ b(inner_loop);
 3845       }
 3846 
 3847       __ BIND(NEXT);
 3848 
 3849       // Encrypt a single byte, and loop.
 3850       // We expect this to be a rare event.
 3851       __ ldrb(rscratch1, Address(in, offset));
 3852       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3853       __ eor(rscratch1, rscratch1, rscratch2);
 3854       __ strb(rscratch1, Address(out, offset));
 3855       __ add(offset, offset, 1);
 3856       __ add(used, used, 1);
 3857       __ subw(len, len,1);
 3858       __ cbnzw(len, L_CTR_loop);
 3859     }
 3860 
 3861     __ bind(DONE);
 3862     __ strw(used, Address(used_ptr));   // *used_ptr = used
 3863     __ mov(r0, saved_len);              // return the original input length
 3864 
 3865     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3866     __ ret(lr);
 3867 
 3868     // Bulk encryption
          // Out-of-line path entered from the branch above; it branches back to
          // large_block_return when it is finished.
 3869 
 3870     __ BIND (CTR_large_block);
 3871     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3872 
          // v8..v11 (and v12..v15 when bulk_width == 8) are used below as input
          // buffers, so their contents are saved on the stack first.
 3873     if (bulk_width == 8) {
 3874       __ sub(sp, sp, 4 * 16);
 3875       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3876     }
 3877     __ sub(sp, sp, 4 * 16);
 3878     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
          // in, out and len are modified by the bulk loop; the values pushed here
          // are the ones restored by the pop below.
 3879     RegSet saved_regs = (RegSet::of(in, out, offset)
 3880                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3881     __ push(saved_regs, sp);
 3882     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3883     __ add(in, in, offset);
 3884     __ add(out, out, offset);
 3885 
 3886     // Keys should already be loaded into the correct registers
 3887 
 3888     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3889     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3890 
 3891     // AES/CTR loop
 3892     {
 3893       Label L_CTR_loop;
 3894       __ BIND(L_CTR_loop);
 3895 
 3896       // Setup the counters
 3897       __ movi(v8, __ T4S, 0);
 3898       __ movi(v9, __ T4S, 1);
 3899       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3900 
            // Produce bulk_width consecutive counters in v0.. and leave v16
            // (byte-reversed form) advanced past them.
 3901       for (int i = 0; i < bulk_width; i++) {
 3902         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3903         __ rev64(v0_ofs, __ T16B, v16);
 3904         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3905       }
 3906 
            // Load the first 4 input blocks (this overwrites the { 0, 1 } constant in v8)
 3907       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3908 
 3909       // Encrypt the counters
 3910       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3911 
 3912       if (bulk_width == 8) {
 3913         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3914       }
 3915 
 3916       // XOR the encrypted counters with the inputs
 3917       for (int i = 0; i < bulk_width; i++) {
 3918         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3919         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3920         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3921       }
 3922 
 3923       // Write the encrypted data
 3924       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3925       if (bulk_width == 8) {
 3926         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3927       }
 3928 
 3929       __ subw(len, len, 16 * bulk_width);
 3930       __ cbnzw(len, L_CTR_loop);
 3931     }
 3932 
 3933     // Save the counter back where it goes
 3934     __ rev64(v16, __ T16B, v16);
 3935     __ st1(v16, __ T16B, counter);
 3936 
 3937     __ pop(saved_regs, sp);
 3938 
 3939     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3940     if (bulk_width == 8) {
 3941       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3942     }
 3943 
          // len holds its pre-bulk value again, so the same mask recomputes how many
          // bytes the bulk loop handled; account for them in len and offset.
 3944     __ andr(rscratch1, len, -16 * bulk_width);
 3945     __ sub(len, len, rscratch1);
 3946     __ add(offset, offset, rscratch1);
 3947     __ mov(used, 16);
 3948     __ strw(used, Address(used_ptr));
 3949     __ b(large_block_return);
 3950 
 3951     // record the stub entry and end
 3952     store_archive_data(stub_id, start, __ pc());
 3953 
 3954     return start;
 3955   }
 3956 
 3957   // Vector AES Galois Counter Mode implementation. Parameters:
 3958   //
 3959   // in = c_rarg0
 3960   // len = c_rarg1
 3961   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3962   // out = c_rarg3
 3963   // key = c_rarg4
 3964   // state = c_rarg5 - GHASH.state
 3965   // subkeyHtbl = c_rarg6 - powers of H
 3966   // counter = c_rarg7 - 16 bytes of CTR
 3967   // return - number of processed bytes
        //
        // len is rounded down to a multiple of 128 bytes (8 blocks) before any work
        // is done; that rounded value is what is returned in r0. The generated code
        // first runs the CTR encryption over all of the data, then hands the same
        // number of blocks to ghash_processBlocks_wide.
        //
        // Returns the stub's entry address, or the address obtained from
        // load_archive_data() when that is non-null.
 3968   address generate_galoisCounterMode_AESCrypt() {
 3969     Label ghash_polynomial; // local data generated after code
 3970     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3971     int entry_count = StubInfo::entry_count(stub_id);
 3972     assert(entry_count == 1, "sanity check");
 3973     address start = load_archive_data(stub_id);
 3974     if (start != nullptr) {
 3975       return start;
 3976     }
 3977     __ align(CodeEntryAlignment);
 3978     StubCodeMark mark(this, stub_id);
 3979     start = __ pc();
 3980     __ enter();
 3981 
 3982     const Register in = c_rarg0;
 3983     const Register len = c_rarg1;
 3984     const Register ct = c_rarg2;
 3985     const Register out = c_rarg3;
 3986     // (counter, declared below, is updated with the incremented counter in the end)
 3987 
 3988     const Register key = c_rarg4;
 3989     const Register state = c_rarg5;
 3990 
 3991     const Register subkeyHtbl = c_rarg6;
 3992 
 3993     const Register counter = c_rarg7;
 3994 
 3995     const Register keylen = r10;
 3996     // Save state before entering routine
          // (v8..v15 are overwritten below with input data)
 3997     __ sub(sp, sp, 4 * 16);
 3998     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3999     __ sub(sp, sp, 4 * 16);
 4000     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 4001 
 4002     // __ andr(len, len, -512);
 4003     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 4004     __ str(len, __ pre(sp, -2 * wordSize));  // keep the rounded length on the stack;
                                                   // it is reloaded for GHASH and as the result
 4005 
 4006     Label DONE;
 4007     __ cbz(len, DONE);
 4008 
 4009     // Compute #rounds for AES based on the length of the key array
 4010     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 4011 
 4012     __ aesenc_loadkeys(key, keylen);
 4013     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 4014     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 4015 
 4016     // AES/CTR loop
 4017     {
 4018       Label L_CTR_loop;
 4019       __ BIND(L_CTR_loop);
 4020 
 4021       // Setup the counters
 4022       __ movi(v8, __ T4S, 0);
 4023       __ movi(v9, __ T4S, 1);
 4024       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 4025 
            // Fill v0..v7 with eight consecutive counters. The increment is a
            // 4 x 32-bit lane add of { 0, 0, 0, 1 }, so only the last 32-bit word of
            // the counter changes and no carry leaves that word.
 4026       assert(v0->encoding() < v8->encoding(), "");
 4027       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 4028         FloatRegister f = as_FloatRegister(i);
 4029         __ rev32(f, __ T16B, v16);
 4030         __ addv(v16, __ T4S, v16, v8);
 4031       }
 4032 
            // First 4 input blocks (this overwrites the increment constant in v8)
 4033       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 4034 
 4035       // Encrypt the counters
 4036       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 4037 
            // Remaining 4 input blocks
 4038       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 4039 
 4040       // XOR the encrypted counters with the inputs
 4041       for (int i = 0; i < 8; i++) {
 4042         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 4043         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 4044         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 4045       }
 4046       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 4047       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 4048 
 4049       __ subw(len, len, 16 * 8);
 4050       __ cbnzw(len, L_CTR_loop);
 4051     }
 4052 
          // Write the advanced counter back in its original byte order
 4053     __ rev32(v16, __ T16B, v16);
 4054     __ st1(v16, __ T16B, counter);
 4055 
 4056     __ ldr(len, Address(sp));          // reload the rounded byte length saved above
 4057     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 4058 
 4059     // GHASH/CTR loop
 4060     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 4061                                 len, /*unrolls*/4);
 4062 
          // Debug builds: stop if len is not zero after the GHASH helper.
 4063 #ifdef ASSERT
 4064     { Label L;
 4065       __ cmp(len, (unsigned char)0);
 4066       __ br(Assembler::EQ, L);
 4067       __ stop("stubGenerator: abort");
 4068       __ bind(L);
 4069   }
 4070 #endif
 4071 
 4072   __ bind(DONE);
 4073     // Return the number of bytes processed
 4074     __ ldr(r0, __ post(sp, 2 * wordSize));
 4075 
          // Restore the vector registers saved on entry
 4076     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 4077     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 4078 
 4079     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4080     __ ret(lr);
 4081 
 4082     // bind label and generate polynomial data
 4083     __ align(wordSize * 2);
 4084     __ bind(ghash_polynomial);
 4085     __ emit_int64(0x87);  // The low-order bits of the field
 4086                           // polynomial (i.e. p = z^7+z^2+z+1)
 4087                           // repeated in the low and high parts of a
 4088                           // 128-bit vector
 4089     __ emit_int64(0x87);
 4090 
 4091     // record the stub entry and end
 4092     store_archive_data(stub_id, start, __ pc());
 4093 
 4094     return start;
 4095   }
 4096 
 4097   class Cached64Bytes {
        // Code-generation helper that keeps a 64-byte chunk in eight general-purpose
        // registers and emits code to pull individual 32-bit words out of them.
 4098   private:
 4099     MacroAssembler *_masm;   // assembler the `__` calls below emit into
 4100     Register _regs[8];       // the eight registers, in RegSet iteration order
 4101 
 4102   public:
          // rs must contain exactly eight registers; they are copied into _regs in
          // the order the RegSet iterator yields them.
 4103     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 4104       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
 4105       auto it = rs.begin();
 4106       for (auto &r: _regs) {
 4107         r = *it;
 4108         ++it;
 4109       }
 4110     }
 4111 
          // Generate four ldp instructions that fill the eight registers from
          // base + 0, 16, 32 and 48.
 4112     void gen_loads(Register base) {
 4113       for (int i = 0; i < 8; i += 2) {
 4114         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 4115       }
 4116     }
 4117 
 4118     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
          // Word i is taken from register i / 2: bits 0..31 for even i, bits 32..63
          // for odd i.
 4119     void extract_u32(Register dest, int i) {
 4120       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 4121     }
 4122   };
 4123 
 4124   // Utility routines for md5.
 4125   // Clobbers r10 and r11; rscratch1 and rscratch2 are overwritten as well.
        //
        // md5_FF emits one step of round 1:
        //   r1 = r2 + rotl32(r1 + F(r2, r3, r4) + X[k] + t, s)
        // where F(b, c, d) is computed as ((c ^ d) & b) ^ d, X[k] is the k-th 32-bit
        // word held in reg_cache, and the left rotate is emitted as a right rotate
        // by 32 - s.
 4126   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4127               int k, int s, int t) {
 4128     Register rscratch3 = r10;
 4129     Register rscratch4 = r11;
 4130 
 4131     __ eorw(rscratch3, r3, r4);                  // c ^ d
 4132     __ movw(rscratch2, t);
 4133     __ andw(rscratch3, rscratch3, r2);           // (c ^ d) & b
 4134     __ addw(rscratch4, r1, rscratch2);           // a + t
 4135     reg_cache.extract_u32(rscratch1, k);         // X[k]
 4136     __ eorw(rscratch3, rscratch3, r4);           // F = ((c ^ d) & b) ^ d
 4137     __ addw(rscratch4, rscratch4, rscratch1);    // a + t + X[k]
 4138     __ addw(rscratch3, rscratch3, rscratch4);    // F + a + t + X[k]
 4139     __ rorw(rscratch2, rscratch3, 32 - s);       // rotate left by s
 4140     __ addw(r1, rscratch2, r2);                  // + b
 4141   }
 4142 
 4143   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4144               int k, int s, int t) {
          // Emits one step of round 2:
          //   r1 = r2 + rotl32(r1 + G(r2, r3, r4) + X[k] + t, s)
          // G(b, c, d) = (b & d) | (c & ~d). The two terms have no bits in common,
          // so they are combined here with additions instead of an OR.
          // Overwrites r10, r11, rscratch1 and rscratch2.
 4145     Register rscratch3 = r10;
 4146     Register rscratch4 = r11;
 4147 
 4148     reg_cache.extract_u32(rscratch1, k);         // X[k]
 4149     __ movw(rscratch2, t);
 4150     __ addw(rscratch4, r1, rscratch2);           // a + t
 4151     __ addw(rscratch4, rscratch4, rscratch1);    // a + t + X[k]
 4152     __ bicw(rscratch2, r3, r4);                  // c & ~d
 4153     __ andw(rscratch3, r2, r4);                  // b & d
 4154     __ addw(rscratch2, rscratch2, rscratch4);
 4155     __ addw(rscratch2, rscratch2, rscratch3);    // G + a + t + X[k]
 4156     __ rorw(rscratch2, rscratch2, 32 - s);       // rotate left by s
 4157     __ addw(r1, rscratch2, r2);                  // + b
 4158   }
 4159 
 4160   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4161               int k, int s, int t) {
          // Emits one step of round 3:
          //   r1 = r2 + rotl32(r1 + H(r2, r3, r4) + X[k] + t, s)
          // with H(b, c, d) = b ^ c ^ d.
          // Overwrites r10, r11, rscratch1 and rscratch2.
 4162     Register rscratch3 = r10;
 4163     Register rscratch4 = r11;
 4164 
 4165     __ eorw(rscratch3, r3, r4);                  // c ^ d
 4166     __ movw(rscratch2, t);
 4167     __ addw(rscratch4, r1, rscratch2);           // a + t
 4168     reg_cache.extract_u32(rscratch1, k);         // X[k]
 4169     __ eorw(rscratch3, rscratch3, r2);           // H = c ^ d ^ b
 4170     __ addw(rscratch4, rscratch4, rscratch1);    // a + t + X[k]
 4171     __ addw(rscratch3, rscratch3, rscratch4);    // H + a + t + X[k]
 4172     __ rorw(rscratch2, rscratch3, 32 - s);       // rotate left by s
 4173     __ addw(r1, rscratch2, r2);                  // + b
 4174   }
 4175 
 4176   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4177               int k, int s, int t) {
          // Emits one step of round 4:
          //   r1 = r2 + rotl32(r1 + I(r2, r3, r4) + X[k] + t, s)
          // with I(b, c, d) = c ^ (b | ~d).
          // Overwrites r10, r11, rscratch1 and rscratch2.
 4178     Register rscratch3 = r10;
 4179     Register rscratch4 = r11;
 4180 
 4181     __ movw(rscratch3, t);
 4182     __ ornw(rscratch2, r2, r4);                  // b | ~d
 4183     __ addw(rscratch4, r1, rscratch3);           // a + t
 4184     reg_cache.extract_u32(rscratch1, k);         // X[k]
 4185     __ eorw(rscratch3, rscratch2, r3);           // I = (b | ~d) ^ c
 4186     __ addw(rscratch4, rscratch4, rscratch1);    // a + t + X[k]
 4187     __ addw(rscratch3, rscratch3, rscratch4);    // I + a + t + X[k]
 4188     __ rorw(rscratch2, rscratch3, 32 - s);       // rotate left by s
 4189     __ addw(r1, rscratch2, r2);                  // + b
 4190   }
 4191 
 4192   // Arguments:
 4193   //
 4194   // Inputs:
 4195   //   c_rarg0   - byte[]  source+offset
 4196   //   c_rarg1   - int[]   MD5 state (four 32-bit words, loaded as a, b, c, d)
 4197   //   c_rarg2   - int     offset
 4198   //   c_rarg3   - int     limit
 4199   //
        // Output (multi-block stub only):
        //   c_rarg0   - updated offset
        //
        // stub_id selects the single-block (stubgen_md5_implCompress_id) or the
        // multi-block (stubgen_md5_implCompressMB_id) variant. The multi-block
        // variant advances the buffer and offset by 64 after each block and keeps
        // looping while offset <= limit.
        // Returns the stub's entry address, or the address obtained from
        // load_archive_data() when that is non-null.
 4200   address generate_md5_implCompress(StubId stub_id) {
 4201     bool multi_block;
 4202     switch (stub_id) {
 4203     case StubId::stubgen_md5_implCompress_id:
 4204       multi_block = false;
 4205       break;
 4206     case StubId::stubgen_md5_implCompressMB_id:
 4207       multi_block = true;
 4208       break;
 4209     default:
 4210       ShouldNotReachHere();
 4211     }
 4212     int entry_count = StubInfo::entry_count(stub_id);
 4213     assert(entry_count == 1, "sanity check");
 4214     address start = load_archive_data(stub_id);
 4215     if (start != nullptr) {
 4216       return start;
 4217     }
 4218     __ align(CodeEntryAlignment);
 4219 
 4220     StubCodeMark mark(this, stub_id);
 4221     start = __ pc();
 4222 
 4223     Register buf       = c_rarg0;
 4224     Register state     = c_rarg1;
 4225     Register ofs       = c_rarg2;
 4226     Register limit     = c_rarg3;
 4227     Register a         = r4;
 4228     Register b         = r5;
 4229     Register c         = r6;
 4230     Register d         = r7;
 4231     Register rscratch3 = r10;
 4232     Register rscratch4 = r11;
 4233 
          // state_regs holds the packed state across the whole stub: a | b << 32 in
          // [0] and c | d << 32 in [1].
 4234     Register state_regs[2] = { r12, r13 };
          // Only the registers in saved_regs are pushed and popped by this stub.
 4235     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 4236     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 4237 
 4238     __ push(saved_regs, sp);
 4239 
 4240     __ ldp(state_regs[0], state_regs[1], Address(state));
 4241     __ ubfx(a, state_regs[0],  0, 32);
 4242     __ ubfx(b, state_regs[0], 32, 32);
 4243     __ ubfx(c, state_regs[1],  0, 32);
 4244     __ ubfx(d, state_regs[1], 32, 32);
 4245 
 4246     Label md5_loop;
 4247     __ BIND(md5_loop);
 4248 
          // Load the 64-byte block at buf into the register cache
 4249     reg_cache.gen_loads(buf);
 4250 
          // In each call below the last three arguments are the message word index k,
          // the rotate amount s and the additive constant t.
 4251     // Round 1
 4252     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 4253     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 4254     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 4255     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 4256     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 4257     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 4258     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 4259     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 4260     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 4261     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 4262     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 4263     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 4264     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 4265     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 4266     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 4267     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 4268 
 4269     // Round 2
 4270     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 4271     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 4272     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 4273     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 4274     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 4275     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 4276     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 4277     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 4278     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 4279     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 4280     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 4281     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 4282     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 4283     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 4284     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 4285     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 4286 
 4287     // Round 3
 4288     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 4289     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 4290     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 4291     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 4292     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 4293     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 4294     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 4295     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 4296     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 4297     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 4298     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 4299     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 4300     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 4301     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 4302     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 4303     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 4304 
 4305     // Round 4
 4306     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 4307     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 4308     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 4309     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 4310     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 4311     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 4312     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 4313     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 4314     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 4315     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 4316     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 4317     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 4318     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 4319     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 4320     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 4321     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 4322 
          // Add the state from before this block to the working values
 4323     __ addw(a, state_regs[0], a);
 4324     __ ubfx(rscratch2, state_regs[0], 32, 32);
 4325     __ addw(b, rscratch2, b);
 4326     __ addw(c, state_regs[1], c);
 4327     __ ubfx(rscratch4, state_regs[1], 32, 32);
 4328     __ addw(d, rscratch4, d);
 4329 
          // Re-pack the new state into the two 64-bit state registers
 4330     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 4331     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 4332 
 4333     if (multi_block) {
 4334       __ add(buf, buf, 64);
 4335       __ add(ofs, ofs, 64);
 4336       __ cmp(ofs, limit);
 4337       __ br(Assembler::LE, md5_loop);   // next block while ofs <= limit
 4338       __ mov(c_rarg0, ofs); // return ofs
 4339     }
 4340 
 4341     // write hash values back in the correct order
 4342     __ stp(state_regs[0], state_regs[1], Address(state));
 4343 
 4344     __ pop(saved_regs, sp);
 4345 
 4346     __ ret(lr);
 4347 
 4348     // record the stub entry and end
 4349     store_archive_data(stub_id, start, __ pc());
 4350 
 4351     return start;
 4352   }
 4353 
 4354   // Arguments:
 4355   //
 4356   // Inputs:
 4357   //   c_rarg0   - byte[]  source+offset
 4358   //   c_rarg1   - int[]   SHA.state (five 32-bit words)
 4359   //   c_rarg2   - int     offset
 4360   //   c_rarg3   - int     limit
 4361   //
        // Output (multi-block stub only):
        //   c_rarg0   - updated offset
        //
        // stub_id selects the single-block (stubgen_sha1_implCompress_id) or the
        // multi-block (stubgen_sha1_implCompressMB_id) variant. The multi-block
        // variant post-increments the buffer pointer, adds 64 to the offset after
        // each block and keeps looping while offset <= limit.
        // Returns the stub's entry address, or the address obtained from
        // load_archive_data() when that is non-null.
 4362   address generate_sha1_implCompress(StubId stub_id) {
 4363     bool multi_block;
 4364     switch (stub_id) {
 4365     case StubId::stubgen_sha1_implCompress_id:
 4366       multi_block = false;
 4367       break;
 4368     case StubId::stubgen_sha1_implCompressMB_id:
 4369       multi_block = true;
 4370       break;
 4371     default:
 4372       ShouldNotReachHere();
 4373     }
 4374     int entry_count = StubInfo::entry_count(stub_id);
 4375     assert(entry_count == 1, "sanity check");
 4376     address start = load_archive_data(stub_id);
 4377     if (start != nullptr) {
 4378       return start;
 4379     }
 4380     __ align(CodeEntryAlignment);
 4381 
 4382     StubCodeMark mark(this, stub_id);
 4383     start = __ pc();
 4384 
 4385     Register buf   = c_rarg0;
 4386     Register state = c_rarg1;
 4387     Register ofs   = c_rarg2;
 4388     Register limit = c_rarg3;
 4389 
 4390     Label keys;        // four 32-bit constants emitted after the code
 4391     Label sha1_loop;
 4392 
 4393     // load the keys into v0..v3
 4394     __ adr(rscratch1, keys);
 4395     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 4396     // load 5 words state into v6, v7
 4397     __ ldrq(v6, Address(state, 0));
 4398     __ ldrs(v7, Address(state, 16));
 4399 
 4400 
 4401     __ BIND(sha1_loop);
 4402     // load 64 bytes of data into v16..v19
 4403     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
          // reverse the bytes within each 32-bit word of the data
 4404     __ rev32(v16, __ T16B, v16);
 4405     __ rev32(v17, __ T16B, v17);
 4406     __ rev32(v18, __ T16B, v18);
 4407     __ rev32(v19, __ T16B, v19);
 4408 
 4409     // do the sha1
 4410     __ addv(v4, __ T4S, v16, v0);     // first data vector + first key
 4411     __ orr(v20, __ T16B, v6, v6);     // v20 = working copy of the first four state words
 4412 
          // d0..d3 name the four data vectors; the names rotate at the end of each
          // iteration below.
 4413     FloatRegister d0 = v16;
 4414     FloatRegister d1 = v17;
 4415     FloatRegister d2 = v18;
 4416     FloatRegister d3 = v19;
 4417 
          // 20 unrolled iterations. The temporaries alternate between two registers
          // on odd and even iterations; key selects v0..v3 by iteration range.
 4418     for (int round = 0; round < 20; round++) {
 4419       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 4420       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 4421       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 4422       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 4423       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 4424 
 4425       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 4426       if (round < 19) __ addv(tmp1, __ T4S, d1, key);   // data + key for the next iteration
 4427       __ sha1h(tmp2, __ T4S, v20);
 4428       if (round < 5)
 4429         __ sha1c(v20, __ T4S, tmp3, tmp4);
 4430       else if (round < 10 || round >= 15)
 4431         __ sha1p(v20, __ T4S, tmp3, tmp4);
 4432       else
 4433         __ sha1m(v20, __ T4S, tmp3, tmp4);
 4434       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 4435 
 4436       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4437     }
 4438 
          // Add this block's result into the state held in v6/v7
 4439     __ addv(v7, __ T2S, v7, v21);
 4440     __ addv(v6, __ T4S, v6, v20);
 4441 
 4442     if (multi_block) {
 4443       __ add(ofs, ofs, 64);
 4444       __ cmp(ofs, limit);
 4445       __ br(Assembler::LE, sha1_loop);   // next block while ofs <= limit
 4446       __ mov(c_rarg0, ofs); // return ofs
 4447     }
 4448 
          // Write the five state words back
 4449     __ strq(v6, Address(state, 0));
 4450     __ strs(v7, Address(state, 16));
 4451 
 4452     __ ret(lr);
 4453 
          // Constant data read by the ld4r at the top of the stub
 4454     __ bind(keys);
 4455     __ emit_int32(0x5a827999);
 4456     __ emit_int32(0x6ed9eba1);
 4457     __ emit_int32(0x8f1bbcdc);
 4458     __ emit_int32(0xca62c1d6);
 4459 
 4460     // record the stub entry and end
 4461     store_archive_data(stub_id, start, __ pc());
 4462 
 4463     return start;
 4464   }
 4465 
 4466 
 4467   // Arguments:
 4468   //
 4469   // Inputs:
 4470   //   c_rarg0   - byte[]  source+offset
 4471   //   c_rarg1   - int[]   SHA.state (eight 32-bit words)
 4472   //   c_rarg2   - int     offset
 4473   //   c_rarg3   - int     limit
 4474   //
        // Output (multi-block stub only):
        //   c_rarg0   - updated offset
        //
        // stub_id selects the single-block (stubgen_sha256_implCompress_id) or the
        // multi-block (stubgen_sha256_implCompressMB_id) variant. The multi-block
        // variant post-increments the buffer pointer, adds 64 to the offset after
        // each block and keeps looping while offset <= limit.
        // Returns the stub's entry address, or the address obtained from
        // load_archive_data() when that is non-null.
 4475   address generate_sha256_implCompress(StubId stub_id) {
 4476     bool multi_block;
 4477     switch (stub_id) {
 4478     case StubId::stubgen_sha256_implCompress_id:
 4479       multi_block = false;
 4480       break;
 4481     case StubId::stubgen_sha256_implCompressMB_id:
 4482       multi_block = true;
 4483       break;
 4484     default:
 4485       ShouldNotReachHere();
 4486     }
 4487     int entry_count = StubInfo::entry_count(stub_id);
 4488     assert(entry_count == 1, "sanity check");
 4489     address start = load_archive_data(stub_id);
 4490     if (start != nullptr) {
 4491       return start;
 4492     }
 4493     __ align(CodeEntryAlignment);
 4494     StubCodeMark mark(this, stub_id);
 4495     start = __ pc();
 4496 
 4497     Register buf   = c_rarg0;
 4498     Register state = c_rarg1;
 4499     Register ofs   = c_rarg2;
 4500     Register limit = c_rarg3;
 4501 
 4502     Label sha1_loop;   // NOTE(review): named sha1_loop although this is the SHA-256 block loop
 4503 
          // Save the d-register halves of v8..v11, which are used as data registers
          // below; they are reloaded before returning.
 4504     __ stpd(v8, v9, __ pre(sp, -32));
 4505     __ stpd(v10, v11, Address(sp, 16));
 4506 
 4507 // dga == v0
 4508 // dgb == v1
 4509 // dg0 == v2
 4510 // dg1 == v3
 4511 // dg2 == v4
 4512 // t0 == v6
 4513 // t1 == v7
 4514 
 4515     // load 16 keys to v16..v31
 4516     __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
 4517     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 4518     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 4519     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 4520     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 4521 
 4522     // load 8 words (256 bits) state
 4523     __ ldpq(v0, v1, state);
 4524 
 4525     __ BIND(sha1_loop);
 4526     // load 64 bytes of data into v8..v11
 4527     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
          // reverse the bytes within each 32-bit word of the data
 4528     __ rev32(v8, __ T16B, v8);
 4529     __ rev32(v9, __ T16B, v9);
 4530     __ rev32(v10, __ T16B, v10);
 4531     __ rev32(v11, __ T16B, v11);
 4532 
 4533     __ addv(v6, __ T4S, v8, v16);     // first data vector + first key vector
 4534     __ orr(v2, __ T16B, v0, v0);      // v2/v3 = working copy of the state in v0/v1
 4535     __ orr(v3, __ T16B, v1, v1);
 4536 
          // d0..d3 name the four data vectors; the names rotate at the end of each
          // iteration below.
 4537     FloatRegister d0 = v8;
 4538     FloatRegister d1 = v9;
 4539     FloatRegister d2 = v10;
 4540     FloatRegister d3 = v11;
 4541 
 4542 
          // 16 unrolled iterations. tmp1/tmp2 alternate between v6 and v7 so that the
          // data + key sum for the next iteration is formed while tmp2 is consumed.
          // NOTE(review): tmp3 and tmp4 are declared but not used in the loop body.
 4543     for (int round = 0; round < 16; round++) {
 4544       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4545       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4546       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4547       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4548 
 4549       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 4550        __ orr(v4, __ T16B, v2, v2);   // keep a copy of v2 for sha256h2 below
 4551       if (round < 15)
 4552         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));   // key vectors v17..v31
 4553       __ sha256h(v2, __ T4S, v3, tmp2);
 4554       __ sha256h2(v3, __ T4S, v4, tmp2);
 4555       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4556 
 4557       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4558     }
 4559 
          // Add this block's result into the state held in v0/v1
 4560     __ addv(v0, __ T4S, v0, v2);
 4561     __ addv(v1, __ T4S, v1, v3);
 4562 
 4563     if (multi_block) {
 4564       __ add(ofs, ofs, 64);
 4565       __ cmp(ofs, limit);
 4566       __ br(Assembler::LE, sha1_loop);   // next block while ofs <= limit
 4567       __ mov(c_rarg0, ofs); // return ofs
 4568     }
 4569 
          // Restore the registers saved on entry and release the stack space
 4570     __ ldpd(v10, v11, Address(sp, 16));
 4571     __ ldpd(v8, v9, __ post(sp, 32));
 4572 
          // Write the eight state words back
 4573     __ stpq(v0, v1, state);
 4574 
 4575     __ ret(lr);
 4576 
 4577     // record the stub entry and end
 4578     store_archive_data(stub_id, start, __ pc());
 4579 
 4580     return start;
 4581   }
 4582 
 4583   // Double rounds for sha512.
 4584   void sha512_dround(int dr,
 4585                      FloatRegister vi0, FloatRegister vi1,
 4586                      FloatRegister vi2, FloatRegister vi3,
 4587                      FloatRegister vi4, FloatRegister vrc0,
 4588                      FloatRegister vrc1, FloatRegister vin0,
 4589                      FloatRegister vin1, FloatRegister vin2,
 4590                      FloatRegister vin3, FloatRegister vin4) {
 4591       if (dr < 36) {
 4592         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4593       }
 4594       __ addv(v5, __ T2D, vrc0, vin0);
 4595       __ ext(v6, __ T16B, vi2, vi3, 8);
 4596       __ ext(v5, __ T16B, v5, v5, 8);
 4597       __ ext(v7, __ T16B, vi1, vi2, 8);
 4598       __ addv(vi3, __ T2D, vi3, v5);
 4599       if (dr < 32) {
 4600         __ ext(v5, __ T16B, vin3, vin4, 8);
 4601         __ sha512su0(vin0, __ T2D, vin1);
 4602       }
 4603       __ sha512h(vi3, __ T2D, v6, v7);
 4604       if (dr < 32) {
 4605         __ sha512su1(vin0, __ T2D, vin2, v5);
 4606       }
 4607       __ addv(vi4, __ T2D, vi1, vi3);
 4608       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4609   }
 4610 
  // Generates the SHA-512 compression stub. stub_id selects between the
  // single-block variant and the multi-block (MB) variant; the MB variant
  // keeps consuming 128-byte blocks while (ofs + 128) <= limit and returns
  // the updated offset in c_rarg0. Each block is processed by 40 calls to
  // sha512_dround().
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  // NOTE(review): the stub reads and writes 64 bytes of state as 64-bit
  // lanes (4 registers x T2D), so "int[]" above looks copied from the
  // SHA-256 stub -- presumably this is a long[]; confirm against the caller.
  //
  // Vector register usage:
  //   v0..v4    - working state, permuted from one double round to the next
  //   v5..v7    - scratch, clobbered by sha512_dround()
  //   v8..v11   - running hash state
  //   v12..v19  - the 128-byte message block / rolling message schedule
  //   v20..v23  - first 64 bytes of round constants (loaded once, before the loop)
  //   v24..v31  - remaining round constants, streamed in by sha512_dround()
  //
  address generate_sha512_implCompress(StubId stub_id) {
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha512_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha512_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // Reuse a previously archived copy of the stub if one is available.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    // v8..v15 are all used below, so spill their low 64 bits (the
    // D-register view) and restore them before returning.
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    Label sha512_loop;

    // load state
    __ ld1(v8, v9, v10, v11, __ T2D, state);

    // load first 4 round constants
    // (rscratch1 is left pointing just past them, i.e. at the constants
    // that the double rounds stream in through rscratch2)
    __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
    __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));

    __ BIND(sha512_loop);
    // load 128B of data into v12..v19
    // (buf is post-incremented in both the single- and multi-block variants)
    __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
    __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
    // byte-swap every 64-bit word of the message block
    __ rev64(v12, __ T16B, v12);
    __ rev64(v13, __ T16B, v13);
    __ rev64(v14, __ T16B, v14);
    __ rev64(v15, __ T16B, v15);
    __ rev64(v16, __ T16B, v16);
    __ rev64(v17, __ T16B, v17);
    __ rev64(v18, __ T16B, v18);
    __ rev64(v19, __ T16B, v19);

    // restart the round-constant stream for this block
    __ mov(rscratch2, rscratch1);

    // working copy of the state
    __ mov(v0, __ T16B, v8);
    __ mov(v1, __ T16B, v9);
    __ mov(v2, __ T16B, v10);
    __ mov(v3, __ T16B, v11);

    // 40 double rounds. The vi0..vi4 arguments repeat with period 5, the
    // schedule registers (v12..v19) with period 8, and vrc0/vrc1 walk
    // through v20..v31 and then cycle through v24..v31. From double round
    // 32 on the schedule is no longer extended and from 36 on no further
    // constants are loaded, so sha512_dround() never touches the
    // corresponding arguments and v0 is passed as a placeholder.
    sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
    sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
    sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
    sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
    sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
    sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
    sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
    sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
    sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
    sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
    sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
    sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);

    // fold the working state back into the running hash state
    __ addv(v8, __ T2D, v8, v0);
    __ addv(v9, __ T2D, v9, v1);
    __ addv(v10, __ T2D, v10, v2);
    __ addv(v11, __ T2D, v11, v3);

    if (multi_block) {
      __ add(ofs, ofs, 128);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha512_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // The state must be written back before v8..v11 are restored below,
    // because it lives in those registers.
    __ st1(v8, v9, v10, v11, __ T2D, state);

    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 4747 
  // Execute one round of keccak of two computations in parallel.
  // One of the states should be loaded into the lower halves of
  // the vector registers v0-v24, the other should be loaded into
  // the upper halves of those registers. The ld1r instruction loads
  // the round constant into both halves of register v31.
  // Intermediate results c0...c4 and d0...d4 are computed
  // in registers v25...v30.
  // All vector instructions that are used operate on both register
  // halves in parallel.
  // If only a single computation is needed, it is sufficient to load
  // only the lower halves.
  //
  // The parameter (which shadows the global rscratch1 inside this function)
  // must point at the round constant for this round; it is post-incremented
  // by 8 so that it points at the next round's constant afterwards.
  //
  // Notation used in the comments below: aN is state lane N, which lives
  // in register vN on entry and on exit. A primed name (aN') is a new lane
  // value that is temporarily parked in some other register because vN
  // still holds a value that is needed later. v25...v31 are clobbered.
  void keccak_round(Register rscratch1) {
  // theta, step 1: column parities c0...c4
  __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
  __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
  __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
  __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
  __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
  __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
  __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
  __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22

  // theta, step 2: d0...d4. The order matters: each c register is
  // overwritten with a d value only after its last use as an input.
  __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
  __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
  __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
  __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
  __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)

  // theta step 3 fused with rho and pi: xor in d, rotate, and move each
  // lane to its new position. The rotate amount is passed as (64 - n)
  // to get a rotate-left by n. The chain is ordered so that a register is
  // only overwritten after its old lane value has been consumed.
  __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
  __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
  __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
  __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
  __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
  __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
  __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
  __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
  __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
  __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
  __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
  __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
  __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
  __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
  __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
  __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
  __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
  __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
  __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
  __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
  __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
  __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
  __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
  __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)

  // chi, one row of five lanes at a time. Lanes parked in temporaries
  // (primed) are written to their home registers here.
  __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
  __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
  __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
  __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
  __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')

  // v31 (a20') is dead from here on, so it can take the round constant.
  __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]

  __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
  __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15)
  __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
  __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
  __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')

  __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
  __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
  __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
  __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
  __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')

  __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
  __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
  __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
  __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7')
  __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7' & a8')

  __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
  __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
  __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
  __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3')
  __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3' & a4')

  // iota
  __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
  }
 4836 
  // Generates the SHA-3 / SHAKE compression stub. stub_id selects between
  // the single-block variant and the multi-block (MB) variant; the MB
  // variant keeps absorbing blocks while (ofs + block_size) <= limit and
  // returns the updated offset in c_rarg0.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - byte[]  SHA.state
  //   c_rarg2   - int     block_size
  //   c_rarg3   - int     offset
  //   c_rarg4   - int     limit
  //
  // The 25 state lanes are kept in the low 64 bits of v0..v24 for the
  // whole stub. For each block, block_size bytes of input are xor-ed into
  // the leading lanes (the number of lanes is derived from individual bits
  // of block_size, see below) and then 24 keccak rounds are applied.
  // c_rarg5, rscratch1, rscratch2 and v25..v31 are used as temporaries.
  //
  address generate_sha3_implCompress(StubId stub_id) {
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha3_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha3_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // Reuse a previously archived copy of the stub if one is available.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register buf           = c_rarg0;
    Register state         = c_rarg1;
    Register block_size    = c_rarg2;
    Register ofs           = c_rarg3;
    Register limit         = c_rarg4;

    Label sha3_loop, rounds24_loop;
    Label sha3_512_or_sha3_384, shake128;

    // v8..v15 hold state lanes below, so spill their low 64 bits (the
    // D-register view) and restore them before returning.
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load state
    // (25 lanes of 8 bytes; rscratch1 walks the array so that the state
    // register itself stays unmodified until the final store)
    __ add(rscratch1, state, 32);
    __ ld1(v0, v1, v2,  v3,  __ T1D, state);
    __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
    __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
    __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
    __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
    __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
    __ ld1(v24, __ T1D, rscratch1);

    __ BIND(sha3_loop);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    // (rscratch1 is advanced by keccak_round, so it is reset for every block)
    __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));

    // load input
    // The first 56 bytes (lanes 0..6) are absorbed for every block size.
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v0, __ T8B, v0, v25);
    __ eor(v1, __ T8B, v1, v26);
    __ eor(v2, __ T8B, v2, v27);
    __ eor(v3, __ T8B, v3, v28);
    __ eor(v4, __ T8B, v4, v29);
    __ eor(v5, __ T8B, v5, v30);
    __ eor(v6, __ T8B, v6, v31);

    // block_size == 72, SHA3-512; block_size == 104, SHA3-384
    // (bit 7 is clear only for these two sizes; it is set for 136, 144
    // and 168)
    __ tbz(block_size, 7, sha3_512_or_sha3_384);

    // block_size >= 136: absorb lanes 7..16 (80 more bytes, 136 in total)
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);
    __ eor(v13, __ T8B, v13, v31);

    __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
    __ eor(v14, __ T8B, v14, v25);
    __ eor(v15, __ T8B, v15, v26);
    __ eor(v16, __ T8B, v16, v27);

    // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
    // (48 == bit4 | bit5; c_rarg5 is only a scratch register here)
    __ andw(c_rarg5, block_size, 48);
    __ cbzw(c_rarg5, rounds24_loop);

    __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
    // (one more lane: lane 17)
    __ ldrd(v28, __ post(buf, 8));
    __ eor(v17, __ T8B, v17, v28);
    __ b(rounds24_loop);

    // four more lanes: lanes 17..20
    __ BIND(shake128);
    __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
    __ eor(v17, __ T8B, v17, v28);
    __ eor(v18, __ T8B, v18, v29);
    __ eor(v19, __ T8B, v19, v30);
    __ eor(v20, __ T8B, v20, v31);
    __ b(rounds24_loop); // block_size == 168, SHAKE128

    // block_size < 128: absorb lanes 7..8 (72 bytes in total) ...
    __ BIND(sha3_512_or_sha3_384);
    __ ld1(v25, v26, __ T8B, __ post(buf, 16));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ tbz(block_size, 5, rounds24_loop); // SHA3-512

    // SHA3-384
    // ... and for block_size == 104 also lanes 9..12
    __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
    __ eor(v9,  __ T8B, v9,  v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);

    // rscratch2 counts the remaining rounds down from 24 to 0
    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);

    keccak_round(rscratch1);

    __ cbnzw(rscratch2, rounds24_loop);

    if (multi_block) {
      __ add(ofs, ofs, block_size);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha3_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // store the 25 state lanes back (this advances the state register,
    // which is not needed afterwards)
    __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
    __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
    __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
    __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
    __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
    __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
    __ st1(v24, __ T1D, state);

    // restore callee-saved registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 4995 
 4996   // Inputs:
 4997   //   c_rarg0   - long[]  state0
 4998   //   c_rarg1   - long[]  state1
 4999   address generate_double_keccak() {
 5000     StubId stub_id = StubId::stubgen_double_keccak_id;
 5001     int entry_count = StubInfo::entry_count(stub_id);
 5002     assert(entry_count == 1, "sanity check");
 5003     address start = load_archive_data(stub_id);
 5004     if (start != nullptr) {
 5005       return start;
 5006     }
 5007     // Implements the double_keccak() method of the
 5008     // sun.security.provider.SHA3Parallel class
 5009     __ align(CodeEntryAlignment);
 5010     StubCodeMark mark(this, stub_id);
 5011     start = __ pc();
 5012     __ enter();
 5013 
 5014     Register state0        = c_rarg0;
 5015     Register state1        = c_rarg1;
 5016 
 5017     Label rounds24_loop;
 5018 
 5019     // save callee-saved registers
 5020     __ stpd(v8, v9, __ pre(sp, -64));
 5021     __ stpd(v10, v11, Address(sp, 16));
 5022     __ stpd(v12, v13, Address(sp, 32));
 5023     __ stpd(v14, v15, Address(sp, 48));
 5024 
 5025     // load states
 5026     __ add(rscratch1, state0, 32);
 5027     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 5028     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 5029     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 5030     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 5031     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 5032     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 5033     __ ld1(v24, __ D, 0, rscratch1);
 5034     __ add(rscratch1, state1, 32);
 5035     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 5036     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 5037     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 5038     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 5039     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 5040     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 5041     __ ld1(v24, __ D, 1, rscratch1);
 5042 
 5043     // 24 keccak rounds
 5044     __ movw(rscratch2, 24);
 5045 
 5046     // load round_constants base
 5047     __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
 5048 
 5049     __ BIND(rounds24_loop);
 5050     __ subw(rscratch2, rscratch2, 1);
 5051     keccak_round(rscratch1);
 5052     __ cbnzw(rscratch2, rounds24_loop);
 5053 
 5054     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 5055     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 5056     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 5057     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 5058     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 5059     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 5060     __ st1(v24, __ D, 0, state0);
 5061     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 5062     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 5063     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 5064     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 5065     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 5066     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 5067     __ st1(v24, __ D, 1, state1);
 5068 
 5069     // restore callee-saved vector registers
 5070     __ ldpd(v14, v15, Address(sp, 48));
 5071     __ ldpd(v12, v13, Address(sp, 32));
 5072     __ ldpd(v10, v11, Address(sp, 16));
 5073     __ ldpd(v8, v9, __ post(sp, 64));
 5074 
 5075     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5076 
 5077     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 5078     __ ret(lr);
 5079 
 5080     // record the stub entry and end
 5081     store_archive_data(stub_id, start, __ pc());
 5082 
 5083     return start;
 5084   }
 5085 
 5086   // ChaCha20 block function.  This version parallelizes the 32-bit
 5087   // state elements on each of 16 vectors, producing 4 blocks of
 5088   // keystream at a time.
 5089   //
 5090   // state (int[16]) = c_rarg0
 5091   // keystream (byte[256]) = c_rarg1
 5092   // return - number of bytes of produced keystream (always 256)
 5093   //
 5094   // This implementation takes each 32-bit integer from the state
 5095   // array and broadcasts it across all 4 32-bit lanes of a vector register
 5096   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 5097   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 5098   // the quarter round schedule is implemented as outlined in RFC 7539 section
 5099   // 2.3.  However, instead of sequentially processing the 3 quarter round
 5100   // operations represented by one QUARTERROUND function, we instead stack all
 5101   // the adds, xors and left-rotations from the first 4 quarter rounds together
 5102   // and then do the same for the second set of 4 quarter rounds.  This removes
 5103   // some latency that would otherwise be incurred by waiting for an add to
 5104   // complete before performing an xor (which depends on the result of the
 5105   // add), etc. An adjustment happens between the first and second groups of 4
 5106   // quarter rounds, but this is done only in the inputs to the macro functions
 5107   // that generate the assembly instructions - these adjustments themselves are
 5108   // not part of the resulting assembly.
 5109   // The 4 registers v0-v3 are used during the quarter round operations as
 5110   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 5111   // registers become the vectors involved in adding the start state back onto
 5112   // the post-QR working state.  After the adds are complete, each of the 16
 5113   // vectors write their first lane back to the keystream buffer, followed
 5114   // by the second lane from all vectors and so on.
 5115   address generate_chacha20Block_blockpar() {
 5116     StubId stub_id = StubId::stubgen_chacha20Block_id;
 5117     int entry_count = StubInfo::entry_count(stub_id);
 5118     assert(entry_count == 1, "sanity check");
 5119     address start = load_archive_data(stub_id);
 5120     if (start != nullptr) {
 5121       return start;
 5122     }
 5123     Label L_twoRounds, L_cc20_const;
 5124     __ align(CodeEntryAlignment);
 5125     StubCodeMark mark(this, stub_id);
 5126     start = __ pc();
 5127     __ enter();
 5128 
 5129     int i, j;
 5130     const Register state = c_rarg0;
 5131     const Register keystream = c_rarg1;
 5132     const Register loopCtr = r10;
 5133     const Register tmpAddr = r11;
 5134     const FloatRegister ctrAddOverlay = v28;
 5135     const FloatRegister lrot8Tbl = v29;
 5136 
 5137     // Organize SIMD registers in an array that facilitates
 5138     // putting repetitive opcodes into loop structures.  It is
 5139     // important that each grouping of 4 registers is monotonically
 5140     // increasing to support the requirements of multi-register
 5141     // instructions (e.g. ld4r, st4, etc.)
 5142     const FloatRegister workSt[16] = {
 5143          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 5144         v20, v21, v22, v23, v24, v25, v26, v27
 5145     };
 5146 
 5147     // Pull in constant data.  The first 16 bytes are the add overlay
 5148     // which is applied to the vector holding the counter (state[12]).
 5149     // The second 16 bytes is the index register for the 8-bit left
 5150     // rotation tbl instruction.
 5151     __ adr(tmpAddr, L_cc20_const);
 5152     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 5153 
 5154     // Load from memory and interlace across 16 SIMD registers,
 5155     // With each word from memory being broadcast to all lanes of
 5156     // each successive SIMD register.
 5157     //      Addr(0) -> All lanes in workSt[i]
 5158     //      Addr(4) -> All lanes workSt[i + 1], etc.
 5159     __ mov(tmpAddr, state);
 5160     for (i = 0; i < 16; i += 4) {
 5161       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 5162           __ post(tmpAddr, 16));
 5163     }
 5164     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5165 
 5166     // Before entering the loop, create 5 4-register arrays.  These
 5167     // will hold the 4 registers that represent the a/b/c/d fields
 5168     // in the quarter round operation.  For instance the "b" field
 5169     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 5170     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 5171     // since it is part of a diagonal organization.  The aSet and scratch
 5172     // register sets are defined at declaration time because they do not change
 5173     // organization at any point during the 20-round processing.
 5174     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 5175     FloatRegister bSet[4];
 5176     FloatRegister cSet[4];
 5177     FloatRegister dSet[4];
 5178     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 5179 
 5180     // Set up the 10 iteration loop and perform all 8 quarter round ops
 5181     __ mov(loopCtr, 10);
 5182     __ BIND(L_twoRounds);
 5183 
 5184     // Set to columnar organization and do the following 4 quarter-rounds:
 5185     // QUARTERROUND(0, 4, 8, 12)
 5186     // QUARTERROUND(1, 5, 9, 13)
 5187     // QUARTERROUND(2, 6, 10, 14)
 5188     // QUARTERROUND(3, 7, 11, 15)
 5189     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 5190     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 5191     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 5192 
 5193     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5194     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5195     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5196 
 5197     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5198     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5199     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5200 
 5201     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5202     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5203     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5204 
 5205     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5206     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5207     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 12
 5208 
 5209     // Set to diagonal organization and do the next 4 quarter-rounds:
 5210     // QUARTERROUND(0, 5, 10, 15)
 5211     // QUARTERROUND(1, 6, 11, 12)
 5212     // QUARTERROUND(2, 7, 8, 13)
 5213     // QUARTERROUND(3, 4, 9, 14)
 5214     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 5215     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 5216     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 5217 
 5218     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5219     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5220     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5221 
 5222     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5223     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5224     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5225 
 5226     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5227     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5228     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5229 
 5230     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5231     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5232     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 12
 5233 
 5234     // Decrement and iterate
 5235     __ sub(loopCtr, loopCtr, 1);
 5236     __ cbnz(loopCtr, L_twoRounds);
 5237 
 5238     __ mov(tmpAddr, state);
 5239 
 5240     // Add the starting state back to the post-loop keystream
 5241     // state.  We read/interlace the state array from memory into
 5242     // 4 registers similar to what we did in the beginning.  Then
 5243     // add the counter overlay onto workSt[12] at the end.
 5244     for (i = 0; i < 16; i += 4) {
 5245       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 5246       __ addv(workSt[i], __ T4S, workSt[i], v0);
 5247       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 5248       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 5249       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 5250     }
 5251     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5252 
 5253     // Write working state into the keystream buffer.  This is accomplished
 5254     // by taking the lane "i" from each of the four vectors and writing
 5255     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 5256     // repeating with the next 4 vectors until all 16 vectors have been used.
 5257     // Then move to the next lane and repeat the process until all lanes have
 5258     // been written.
 5259     for (i = 0; i < 4; i++) {
 5260       for (j = 0; j < 16; j += 4) {
 5261         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 5262             __ post(keystream, 16));
 5263       }
 5264     }
 5265 
 5266     __ mov(r0, 256);             // Return length of output keystream
 5267     __ leave();
 5268     __ ret(lr);
 5269 
 5270     // bind label and generate local constant data used by this stub
 5271     // The constant data is broken into two 128-bit segments to be loaded
 5272     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 5273     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 5274     // The second 128-bits is a table constant used for 8-bit left rotations.
 5275     __ BIND(L_cc20_const);
 5276     __ emit_int64(0x0000000100000000UL);
 5277     __ emit_int64(0x0000000300000002UL);
 5278     __ emit_int64(0x0605040702010003UL);
 5279     __ emit_int64(0x0E0D0C0F0A09080BUL);
 5280 
 5281     // record the stub entry and end
 5282     store_archive_data(stub_id, start, __ pc());
 5283 
 5284     return start;
 5285   }
 5286 
 5287   // Helpers to schedule parallel operation bundles across vector
 5288   // register sequences of size 2, 4 or 8.
 5289 
 5290   // Implement various primitive computations across vector sequences
 5291 
 5292   template<int N>
 5293   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5294                const VSeq<N>& v1, const VSeq<N>& v2) {
 5295     // output must not be constant
 5296     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5297     // output cannot overwrite pending inputs
 5298     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5299     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5300     for (int i = 0; i < N; i++) {
 5301       __ addv(v[i], T, v1[i], v2[i]);
 5302     }
 5303   }
 5304 
 5305   template<int N>
 5306   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5307                const VSeq<N>& v1, const VSeq<N>& v2) {
 5308     // output must not be constant
 5309     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5310     // output cannot overwrite pending inputs
 5311     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5312     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5313     for (int i = 0; i < N; i++) {
 5314       __ subv(v[i], T, v1[i], v2[i]);
 5315     }
 5316   }
 5317 
 5318   template<int N>
 5319   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5320                const VSeq<N>& v1, const VSeq<N>& v2) {
 5321     // output must not be constant
 5322     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5323     // output cannot overwrite pending inputs
 5324     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5325     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5326     for (int i = 0; i < N; i++) {
 5327       __ mulv(v[i], T, v1[i], v2[i]);
 5328     }
 5329   }
 5330 
 5331   template<int N>
 5332   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 5333     // output must not be constant
 5334     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5335     // output cannot overwrite pending inputs
 5336     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5337     for (int i = 0; i < N; i++) {
 5338       __ negr(v[i], T, v1[i]);
 5339     }
 5340   }
 5341 
 5342   template<int N>
 5343   void vs_shl(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5344               const VSeq<N>& v1, int shift) {
 5345     // output must not be constant
 5346     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5347     // output cannot overwrite pending inputs
 5348     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5349 
 5350     for (int i = 0; i < N; i++) {
 5351       __ shl(v[i], T, v1[i], shift);
 5352     }
 5353   }
 5354 
 5355   template<int N>
 5356   void vs_ushr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5357                const VSeq<N>& v1, int shift) {
 5358     // output must not be constant
 5359     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5360     // output cannot overwrite pending inputs
 5361     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5362 
 5363     for (int i = 0; i < N; i++) {
 5364       __ ushr(v[i], T, v1[i], shift);
 5365     }
 5366   }
 5367 
 5368   template<int N>
 5369   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5370                const VSeq<N>& v1, int shift) {
 5371     // output must not be constant
 5372     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5373     // output cannot overwrite pending inputs
 5374     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5375     for (int i = 0; i < N; i++) {
 5376       __ sshr(v[i], T, v1[i], shift);
 5377     }
 5378   }
 5379 
 5380   template<int N>
 5381   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5382     // output must not be constant
 5383     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5384     // output cannot overwrite pending inputs
 5385     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5386     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5387     for (int i = 0; i < N; i++) {
 5388       __ andr(v[i], __ T16B, v1[i], v2[i]);
 5389     }
 5390   }
 5391 
 5392   template<int N>
 5393   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const FloatRegister v2) {
 5394     // output must not be constant
 5395     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5396     // output cannot overwrite pending inputs
 5397     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5398     for (int i = 0; i < N; i++) {
 5399       __ andr(v[i], __ T16B, v1[i], v2);
 5400     }
 5401   }
 5402 
 5403   template<int N>
 5404   void vs_eor(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5405     // output must not be constant
 5406     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5407     // output cannot overwrite pending inputs
 5408     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5409     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5410     for (int i = 0; i < N; i++) {
 5411       __ eor(v[i], __ T16B, v1[i], v2[i]);
 5412     }
 5413   }
 5414 
 5415   template<int N>
 5416   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5417     // output must not be constant
 5418     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5419     // output cannot overwrite pending inputs
 5420     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5421     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5422     for (int i = 0; i < N; i++) {
 5423       __ orr(v[i], __ T16B, v1[i], v2[i]);
 5424     }
 5425   }
 5426 
 5427   template<int N>
 5428   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 5429     // output must not be constant
 5430     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5431     // output cannot overwrite pending inputs
 5432     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5433     for (int i = 0; i < N; i++) {
 5434       __ notr(v[i], __ T16B, v1[i]);
 5435     }
 5436   }
 5437 
 5438   template<int N>
 5439   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 5440     // output must not be constant
 5441     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5442     // output cannot overwrite pending inputs
 5443     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5444     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5445     for (int i = 0; i < N; i++) {
 5446       __ sqdmulh(v[i], T, v1[i], v2[i]);
 5447     }
 5448   }
 5449 
 5450   template<int N>
 5451   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 5452     // output must not be constant
 5453     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5454     // output cannot overwrite pending inputs
 5455     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5456     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5457     for (int i = 0; i < N; i++) {
 5458       __ mlsv(v[i], T, v1[i], v2[i]);
 5459     }
 5460   }
 5461 
 5462   // load N/2 successive pairs of quadword values from memory in order
 5463   // into N successive vector registers of the sequence via the
 5464   // address supplied in base.
 5465   template<int N>
 5466   void vs_ldpq(const VSeq<N>& v, Register base) {
 5467     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5468     for (int i = 0; i < N; i += 2) {
 5469       __ ldpq(v[i], v[i+1], Address(base, 16 * i));
 5470     }
 5471   }
 5472 
 5473   // load N/2 successive pairs of quadword values from memory in order
 5474   // into N vector registers of the sequence via the address supplied
 5475   // in base using post-increment addressing
 5476   template<int N>
 5477   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 5478     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5479     for (int i = 0; i < N; i += 2) {
 5480       __ ldpq(v[i], v[i+1], __ post(base, 32));
 5481     }
 5482   }
 5483 
 5484   // store N successive vector registers of the sequence into N/2
 5485   // successive pairs of quadword memory locations via the address
 5486   // supplied in base using post-increment addressing
 5487   template<int N>
 5488   void vs_stpq_post(const VSeq<N>& v, Register base) {
 5489     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5490     for (int i = 0; i < N; i += 2) {
 5491       __ stpq(v[i], v[i+1], __ post(base, 32));
 5492     }
 5493   }
 5494 
 5495   // load N/2 pairs of quadword values from memory de-interleaved into
 5496   // N vector registers 2 at a time via the address supplied in base
 5497   // using post-increment addressing.
 5498   template<int N>
 5499   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5500     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5501     for (int i = 0; i < N; i += 2) {
 5502       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 5503     }
 5504   }
 5505 
 5506   // store N vector registers interleaved into N/2 pairs of quadword
 5507   // memory locations via the address supplied in base using
 5508   // post-increment addressing.
 5509   template<int N>
 5510   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5511     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5512     for (int i = 0; i < N; i += 2) {
 5513       __ st2(v[i], v[i+1], T, __ post(base, 32));
 5514     }
 5515   }
 5516 
 5517   // store two vector register sequences of length N
 5518   // interleaved into N pairs of quadword memory locations
 5519   // starting at the address supplied in dest using
 5520   // post-increment addressing.
 5521   template<int N>
 5522   void vs_st1_interleaved(VSeq<N> A, VSeq<N> B, Register dest) {
 5523     for (int i = 0; i < N; i++) {
 5524       __ st1(A[i], __ T2D, __ post(dest, 16));
 5525       __ st1(B[i], __ T2D, __ post(dest, 16));
 5526     }
 5527   }
 5528 
 5529   // load N quadword values from memory de-interleaved into N vector
 5530   // registers 3 elements at a time via the address supplied in base.
 5531   template<int N>
 5532   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5533     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 5534     for (int i = 0; i < N; i += 3) {
 5535       __ ld3(v[i], v[i+1], v[i+2], T, base);
 5536     }
 5537   }
 5538 
 5539   // load N quadword values from memory de-interleaved into N vector
 5540   // registers 3 elements at a time via the address supplied in base
 5541   // using post-increment addressing.
 5542   template<int N>
 5543   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5544     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 5545     for (int i = 0; i < N; i += 3) {
 5546       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 5547     }
 5548   }
 5549 
 5550   // load N/2 pairs of quadword values from memory into N vector
 5551   // registers via the address supplied in base with each pair indexed
 5552   // using the start offset plus the corresponding entry in the
 5553   // offsets array
 5554   template<int N>
 5555   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5556     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5557     for (int i = 0; i < N/2; i++) {
 5558       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5559     }
 5560   }
 5561 
 5562   // store N vector registers into N/2 pairs of quadword memory
 5563   // locations via the address supplied in base with each pair indexed
 5564   // using the start offset plus the corresponding entry in the
 5565   // offsets array
 5566   template<int N>
 5567   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 5568     for (int i = 0; i < N/2; i++) {
 5569       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5570     }
 5571   }
 5572 
 5573   // load N single quadword values from memory into N vector registers
 5574   // via the address supplied in base with each value indexed using
 5575   // the start offset plus the corresponding entry in the offsets
 5576   // array
 5577   template<int N>
 5578   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5579                       int start, int (&offsets)[N]) {
 5580     for (int i = 0; i < N; i++) {
 5581       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5582     }
 5583   }
 5584 
 5585   // store N vector registers into N single quadword memory locations
 5586   // via the address supplied in base with each value indexed using
 5587   // the start offset plus the corresponding entry in the offsets
 5588   // array
 5589   template<int N>
 5590   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5591                       int start, int (&offsets)[N]) {
 5592     for (int i = 0; i < N; i++) {
 5593       __ str(v[i], T, Address(base, start + offsets[i]));
 5594     }
 5595   }
 5596 
 5597   // load N/2 pairs of quadword values from memory de-interleaved into
 5598   // N vector registers 2 at a time via the address supplied in base
 5599   // with each pair indexed using the start offset plus the
 5600   // corresponding entry in the offsets array
 5601   template<int N>
 5602   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5603                       Register tmp, int start, int (&offsets)[N/2]) {
 5604     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5605     for (int i = 0; i < N/2; i++) {
 5606       __ add(tmp, base, start + offsets[i]);
 5607       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5608     }
 5609   }
 5610 
 5611   // store N vector registers 2 at a time interleaved into N/2 pairs
 5612   // of quadword memory locations via the address supplied in base
 5613   // with each pair indexed using the start offset plus the
 5614   // corresponding entry in the offsets array
 5615   template<int N>
 5616   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5617                       Register tmp, int start, int (&offsets)[N/2]) {
 5618     static_assert(N > 0 && is_even(N), "sequence length must be even");
 5619     for (int i = 0; i < N/2; i++) {
 5620       __ add(tmp, base, start + offsets[i]);
 5621       __ st2(v[2*i], v[2*i+1], T, tmp);
 5622     }
 5623   }
 5624 
 5625   // Helper routines for various flavours of Montgomery multiply
 5626 
 5627   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5628   // multiplications in parallel
 5629   //
 5630 
 5631   // See the montMul() method of the sun.security.provider.ML_DSA
 5632   // class.
 5633   //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q_INV_MOD_R, MONT_Q>
 5638   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5639   // Outputs: va - 4x4S or 4x8H vector register sequences
 5640   // vb, vc, vtmp and vq must all be disjoint
 5641   // va must be disjoint from all other inputs/temps or must equal vc
 5642   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5643   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5644   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5645                    Assembler::SIMD_Arrangement T,
 5646                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5647     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5648     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5649     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5650     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5651 
 5652     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5653     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5654 
 5655     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5656 
 5657     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5658     assert(vs_disjoint(va, vb), "va and vb overlap");
 5659     assert(vs_disjoint(va, vq), "va and vq overlap");
 5660     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5661     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5662 
 5663     // schedule 4 streams of instructions across the vector sequences
 5664     for (int i = 0; i < 4; i++) {
 5665       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5666       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5667     }
 5668 
 5669     for (int i = 0; i < 4; i++) {
 5670       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5671     }
 5672 
 5673     for (int i = 0; i < 4; i++) {
 5674       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5675     }
 5676 
 5677     for (int i = 0; i < 4; i++) {
 5678       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5679     }
 5680   }
 5681 
  // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q_INV_MOD_R, MONT_Q>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5699   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5700                    Assembler::SIMD_Arrangement T,
 5701                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5702     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5703     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5704     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5705     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5706 
 5707     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5708     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5709 
 5710     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5711 
 5712     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5713     assert(vs_disjoint(va, vb), "va and vb overlap");
 5714     assert(vs_disjoint(va, vq), "va and vq overlap");
 5715     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5716     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5717 
 5718     // schedule 2 streams of instructions across the vector sequences
 5719     for (int i = 0; i < 2; i++) {
 5720       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5721       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5722     }
 5723 
 5724     for (int i = 0; i < 2; i++) {
 5725       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5726     }
 5727 
 5728     for (int i = 0; i < 2; i++) {
 5729       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5730     }
 5731 
 5732     for (int i = 0; i < 2; i++) {
 5733       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5734     }
 5735   }
 5736 
 5737   // Perform 16 16-bit Montgomery multiplications in parallel.
 5738   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5739                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5740     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5741     // It will assert that the register use is valid
 5742     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5743   }
 5744 
 5745   // Perform 32 16-bit Montgomery multiplications in parallel.
 5746   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5747                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5748     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5749     // It will assert that the register use is valid
 5750     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5751   }
 5752 
 5753   // Perform 64 16-bit Montgomery multiplications in parallel.
 5754   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5755                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5756     // Schedule two successive 4x8H multiplies via the montmul helper
 5757     // on the front and back halves of va, vb and vc. The helper will
 5758     // assert that the register use has no overlap conflicts on each
 5759     // individual call but we also need to ensure that the necessary
 5760     // disjoint/equality constraints are met across both calls.
 5761 
 5762     // vb, vc, vtmp and vq must be disjoint. va must either be
 5763     // disjoint from all other registers or equal vc
 5764 
 5765     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5766     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5767     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5768 
 5769     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5770     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5771 
 5772     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5773 
 5774     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5775     assert(vs_disjoint(va, vb), "va and vb overlap");
 5776     assert(vs_disjoint(va, vq), "va and vq overlap");
 5777     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5778 
 5779     // we multiply the front and back halves of each sequence 4 at a
 5780     // time because
 5781     //
 5782     // 1) we are currently only able to get 4-way instruction
 5783     // parallelism at best
 5784     //
 5785     // 2) we need registers for the constants in vq and temporary
 5786     // scratch registers to hold intermediate results so vtmp can only
 5787     // be a VSeq<4> which means we only have 4 scratch slots
 5788 
 5789     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5790     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5791   }
 5792 
 5793   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5794                                const VSeq<4>& vc,
 5795                                const VSeq<4>& vtmp,
 5796                                const VSeq<2>& vq) {
 5797     // compute a = montmul(a1, c)
 5798     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5799     // ouptut a1 = a0 - a
 5800     vs_subv(va1, __ T8H, va0, vc);
 5801     //    and a0 = a0 + a
 5802     vs_addv(va0, __ T8H, va0, vc);
 5803   }
 5804 
 5805   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5806                                const VSeq<4>& vb,
 5807                                const VSeq<4>& vtmp1,
 5808                                const VSeq<4>& vtmp2,
 5809                                const VSeq<2>& vq) {
 5810     // compute c = a0 - a1
 5811     vs_subv(vtmp1, __ T8H, va0, va1);
 5812     // output a0 = a0 + a1
 5813     vs_addv(va0, __ T8H, va0, va1);
 5814     // output a1 = b montmul c
 5815     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5816   }
 5817 
 5818   void load64shorts(const VSeq<8>& v, Register shorts) {
 5819     vs_ldpq_post(v, shorts);
 5820   }
 5821 
 5822   void load32shorts(const VSeq<4>& v, Register shorts) {
 5823     vs_ldpq_post(v, shorts);
 5824   }
 5825 
 5826   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5827     vs_stpq_post(v, tmpAddr);
 5828   }
 5829 
 5830   // Kyber NTT function.
 5831   // Implements
 5832   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5833   //
 5834   // coeffs (short[256]) = c_rarg0
 5835   // ntt_zetas (short[256]) = c_rarg1
 5836   address generate_kyberNtt() {
 5837     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5838     int entry_count = StubInfo::entry_count(stub_id);
 5839     assert(entry_count == 1, "sanity check");
 5840     address start = load_archive_data(stub_id);
 5841     if (start != nullptr) {
 5842       return start;
 5843     }
 5844     __ align(CodeEntryAlignment);
 5845     StubCodeMark mark(this, stub_id);
 5846     start = __ pc();
 5847     __ enter();
 5848 
 5849     const Register coeffs = c_rarg0;
 5850     const Register zetas = c_rarg1;
 5851 
 5852     const Register kyberConsts = r10;
 5853     const Register tmpAddr = r11;
 5854 
 5855     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5856     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5857     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5858 
 5859     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5860     // load the montmul constants
 5861     vs_ldpq(vq, kyberConsts);
 5862 
 5863     // Each level corresponds to an iteration of the outermost loop of the
 5864     // Java method seilerNTT(int[] coeffs). There are some differences
 5865     // from what is done in the seilerNTT() method, though:
 5866     // 1. The computation is using 16-bit signed values, we do not convert them
 5867     // to ints here.
 5868     // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
 5869     // this array for each level, it is easier that way to fill up the vector
 5870     // registers.
 5871     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5872     // multiplications (this is because that way there should not be any
 5873     // overflow during the inverse NTT computation), here we use R = 2^16 so
 5874     // that we can use the 16-bit arithmetic in the vector unit.
 5875     //
 5876     // On each level, we fill up the vector registers in such a way that the
 5877     // array elements that need to be multiplied by the zetas go into one
 5878     // set of vector registers while the corresponding ones that don't need to
 5879     // be multiplied, go into another set.
 5880     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5881     // registers interleaving the steps of 4 identical computations,
 5882     // each done on 8 16-bit values per register.
 5883 
 5884     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5885     // to the zetas occur in discrete blocks whose size is some multiple
 5886     // of 32.
 5887 
 5888     // level 0
 5889     __ add(tmpAddr, coeffs, 256);
 5890     load64shorts(vs1, tmpAddr);
 5891     load64shorts(vs2, zetas);
 5892     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5893     __ add(tmpAddr, coeffs, 0);
 5894     load64shorts(vs1, tmpAddr);
 5895     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5896     vs_addv(vs1, __ T8H, vs1, vs2);
 5897     __ add(tmpAddr, coeffs, 0);
 5898     vs_stpq_post(vs1, tmpAddr);
 5899     __ add(tmpAddr, coeffs, 256);
 5900     vs_stpq_post(vs3, tmpAddr);
 5901     // restore montmul constants
 5902     vs_ldpq(vq, kyberConsts);
 5903     load64shorts(vs1, tmpAddr);
 5904     load64shorts(vs2, zetas);
 5905     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5906     __ add(tmpAddr, coeffs, 128);
 5907     load64shorts(vs1, tmpAddr);
 5908     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5909     vs_addv(vs1, __ T8H, vs1, vs2);
 5910     __ add(tmpAddr, coeffs, 128);
 5911     store64shorts(vs1, tmpAddr);
 5912     __ add(tmpAddr, coeffs, 384);
 5913     store64shorts(vs3, tmpAddr);
 5914 
 5915     // level 1
 5916     // restore montmul constants
 5917     vs_ldpq(vq, kyberConsts);
 5918     __ add(tmpAddr, coeffs, 128);
 5919     load64shorts(vs1, tmpAddr);
 5920     load64shorts(vs2, zetas);
 5921     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5922     __ add(tmpAddr, coeffs, 0);
 5923     load64shorts(vs1, tmpAddr);
 5924     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5925     vs_addv(vs1, __ T8H, vs1, vs2);
 5926     __ add(tmpAddr, coeffs, 0);
 5927     store64shorts(vs1, tmpAddr);
 5928     store64shorts(vs3, tmpAddr);
 5929     vs_ldpq(vq, kyberConsts);
 5930     __ add(tmpAddr, coeffs, 384);
 5931     load64shorts(vs1, tmpAddr);
 5932     load64shorts(vs2, zetas);
 5933     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5934     __ add(tmpAddr, coeffs, 256);
 5935     load64shorts(vs1, tmpAddr);
 5936     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5937     vs_addv(vs1, __ T8H, vs1, vs2);
 5938     __ add(tmpAddr, coeffs, 256);
 5939     store64shorts(vs1, tmpAddr);
 5940     store64shorts(vs3, tmpAddr);
 5941 
 5942     // level 2
 5943     vs_ldpq(vq, kyberConsts);
 5944     int offsets1[4] = { 0, 32, 128, 160 };
 5945     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5946     load64shorts(vs2, zetas);
 5947     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5948     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5949     // kyber_subv_addv64();
 5950     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5951     vs_addv(vs1, __ T8H, vs1, vs2);
 5952     __ add(tmpAddr, coeffs, 0);
 5953     vs_stpq_post(vs_front(vs1), tmpAddr);
 5954     vs_stpq_post(vs_front(vs3), tmpAddr);
 5955     vs_stpq_post(vs_back(vs1), tmpAddr);
 5956     vs_stpq_post(vs_back(vs3), tmpAddr);
 5957     vs_ldpq(vq, kyberConsts);
 5958     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5959     load64shorts(vs2, zetas);
 5960     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5961     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5962     // kyber_subv_addv64();
 5963     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5964     vs_addv(vs1, __ T8H, vs1, vs2);
 5965     __ add(tmpAddr, coeffs, 256);
 5966     vs_stpq_post(vs_front(vs1), tmpAddr);
 5967     vs_stpq_post(vs_front(vs3), tmpAddr);
 5968     vs_stpq_post(vs_back(vs1), tmpAddr);
 5969     vs_stpq_post(vs_back(vs3), tmpAddr);
 5970 
 5971     // level 3
 5972     vs_ldpq(vq, kyberConsts);
 5973     int offsets2[4] = { 0, 64, 128, 192 };
 5974     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5975     load64shorts(vs2, zetas);
 5976     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5977     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5978     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5979     vs_addv(vs1, __ T8H, vs1, vs2);
 5980     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5981     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5982 
 5983     vs_ldpq(vq, kyberConsts);
 5984     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5985     load64shorts(vs2, zetas);
 5986     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5987     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5988     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5989     vs_addv(vs1, __ T8H, vs1, vs2);
 5990     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5991     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5992 
 5993     // level 4
 5994     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5995     // so they are loaded by employing an ldr at 8 distinct offsets.
 5996 
 5997     vs_ldpq(vq, kyberConsts);
 5998     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5999     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 6000     load64shorts(vs2, zetas);
 6001     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6002     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6003     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6004     vs_addv(vs1, __ T8H, vs1, vs2);
 6005     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6006     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 6007 
 6008     vs_ldpq(vq, kyberConsts);
 6009     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 6010     load64shorts(vs2, zetas);
 6011     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6012     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6013     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6014     vs_addv(vs1, __ T8H, vs1, vs2);
 6015     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6016     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 6017 
 6018     // level 5
 6019     // At level 5 related coefficients occur in discrete blocks of size 8 so
 6020     // need to be loaded interleaved using an ld2 operation with arrangement 2D.
 6021 
 6022     vs_ldpq(vq, kyberConsts);
 6023     int offsets4[4] = { 0, 32, 64, 96 };
 6024     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6025     load32shorts(vs_front(vs2), zetas);
 6026     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6027     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6028     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6029     load32shorts(vs_front(vs2), zetas);
 6030     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6031     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6032     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6033     load32shorts(vs_front(vs2), zetas);
 6034     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6035     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6036 
 6037     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6038     load32shorts(vs_front(vs2), zetas);
 6039     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6040     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6041 
 6042     // level 6
 6043     // At level 6 related coefficients occur in discrete blocks of size 4 so
 6044     // need to be loaded interleaved using an ld2 operation with arrangement 4S.
 6045 
 6046     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6047     load32shorts(vs_front(vs2), zetas);
 6048     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6049     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6050     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6051     load32shorts(vs_front(vs2), zetas);
 6052     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6053     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6054 
 6055     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6056     load32shorts(vs_front(vs2), zetas);
 6057     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6058     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6059 
 6060     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6061     load32shorts(vs_front(vs2), zetas);
 6062     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 6063     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6064 
 6065     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6066     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6067     __ ret(lr);
 6068 
 6069     // record the stub entry and end
 6070     store_archive_data(stub_id, start, __ pc());
 6071 
 6072     return start;
 6073   }
 6074 
 6075   // Kyber Inverse NTT function
 6076   // Implements
 6077   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 6078   //
 6079   // coeffs (short[256]) = c_rarg0
 6080   // ntt_zetas (short[256]) = c_rarg1
 6081   address generate_kyberInverseNtt() {
          // Generates the stub for implKyberInverseNtt and returns its entry
          // address. The statements below emit the stub's instructions in
          // sequence, so their order is significant.
 6082     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 6083     int entry_count = StubInfo::entry_count(stub_id);
 6084     assert(entry_count == 1, "sanity check");
          // If load_archive_data() already supplies an entry for this stub id
          // it is returned as is and nothing is generated here.
 6085     address start = load_archive_data(stub_id);
 6086     if (start != nullptr) {
 6087       return start;
 6088     }
 6089     __ align(CodeEntryAlignment);
 6090     StubCodeMark mark(this, stub_id);
 6091     start = __ pc();
 6092     __ enter();
 6093
          // incoming arguments
 6094     const Register coeffs = c_rarg0;
 6095     const Register zetas = c_rarg1;
 6096
          // scratch registers
 6097     const Register kyberConsts = r10;
 6098     const Register tmpAddr = r11;
          // NOTE(review): tmpAddr2 is declared but never referenced in this stub.
 6099     const Register tmpAddr2 = c_rarg2;
 6100
          // Because vtmp and vq live inside vs3, any operation that writes all
          // of vs3 destroys the constants held in vq; every such write below is
          // followed by a reload of vq before the next montmul.
 6101     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 6102     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6103     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6104
 6105     __ lea(kyberConsts,
 6106              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6107
 6108     // level 0
 6109     // At level 0 related coefficients occur in discrete blocks of size 4 so
 6110     // need to be loaded interleaved using an ld2 operation with arrangement 4S.
 6111
          // Four passes follow, using base byte offsets 0, 128, 256 and 384
          // together with offsets4.
          // NOTE(review): zetas is never re-based between loads, so
          // load32shorts/load64shorts presumably post-increment it - confirm
          // against the helpers.
 6112     vs_ldpq(vq, kyberConsts);
 6113     int offsets4[4] = { 0, 32, 64, 96 };
 6114     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6115     load32shorts(vs_front(vs2), zetas);
 6116     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6117                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6118     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6119     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6120     load32shorts(vs_front(vs2), zetas);
 6121     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6122                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6123     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6124     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6125     load32shorts(vs_front(vs2), zetas);
 6126     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6127                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6128     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6129     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6130     load32shorts(vs_front(vs2), zetas);
 6131     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6132                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6133     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6134
 6135     // level 1
 6136     // At level 1 related coefficients occur in discrete blocks of size 8 so
 6137     // need to be loaded interleaved using an ld2 operation with arrangement 2D.
 6138
          // Same four passes as level 0; only the ld2/st2 arrangement differs.
          // vq is not reloaded here: it was loaded at the start of level 0 and
          // no statement since then names vs3 or vq as a destination.
 6139     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6140     load32shorts(vs_front(vs2), zetas);
 6141     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6142                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6143     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6144     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6145     load32shorts(vs_front(vs2), zetas);
 6146     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6147                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6148     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6149
 6150     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6151     load32shorts(vs_front(vs2), zetas);
 6152     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6153                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6154     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6155     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6156     load32shorts(vs_front(vs2), zetas);
 6157     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6158                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6159     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6160
 6161     // level 2
 6162     // At level 2 coefficients occur in 8 discrete blocks of size 16
 6163     // so they are loaded by employing an ldr at 8 distinct offsets.
 6164
          // From here on each step follows the same shape: load both operand
          // sets, write the sum to vs3 and the difference to vs1, store the
          // sum, then montmul the difference by the zetas and store that.
          // The sum lands in vs3, so vq is reloaded before each montmul.
 6165     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6166     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6167     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6168     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6169     vs_subv(vs1, __ T8H, vs1, vs2);
 6170     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 6171     load64shorts(vs2, zetas);
 6172     vs_ldpq(vq, kyberConsts);
 6173     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6174     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6175
          // second half: same step with every base offset shifted by 256 bytes
 6176     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6177     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6178     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6179     vs_subv(vs1, __ T8H, vs1, vs2);
 6180     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 6181     load64shorts(vs2, zetas);
 6182     vs_ldpq(vq, kyberConsts);
 6183     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6184     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6185
 6186     // Barrett reduction at indexes where overflow may happen
 6187
 6188     // load q and the multiplier for the Barrett reduction
          // n.b. this replaces the montmul constants in vq with the pair found
          // at kyberConsts + 16
 6189     __ add(tmpAddr, kyberConsts, 16);
 6190     vs_ldpq(vq, tmpAddr);
 6191
          // vq1 and vq2 are delta 0 sequences naming the registers vq[0] and
          // vq[1], so they see whatever is loaded into vq at the time of use.
          // vq3 names v29, which is only loaded in the final "multiply by
          // 2^-n" step, the only place where vq3 is used.
 6192     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 6193     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 6194     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
          // The reduction is applied to the locations where the level 2 sums
          // were stored (base offsets 0 and 256 with offsets3).
 6195     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6196     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6197     vs_sshr(vs2, __ T8H, vs2, 11);
 6198     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6199     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6200     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6201     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6202     vs_sshr(vs2, __ T8H, vs2, 11);
 6203     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6204     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6205
 6206     // level 3
 6207     // From level 3 upwards coefficients occur in discrete blocks whose size is
 6208     // some multiple of 32 so can be loaded using ldpq and suitable indexes.
 6209
          // vq still holds the Barrett constants at this point; the montmul
          // constants are restored by the vs_ldpq ahead of each montmul.
 6210     int offsets2[4] = { 0, 64, 128, 192 };
 6211     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 6212     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 6213     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6214     vs_subv(vs1, __ T8H, vs1, vs2);
 6215     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 6216     load64shorts(vs2, zetas);
 6217     vs_ldpq(vq, kyberConsts);
 6218     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6219     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 6220
 6221     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 6222     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6223     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6224     vs_subv(vs1, __ T8H, vs1, vs2);
 6225     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 6226     load64shorts(vs2, zetas);
 6227     vs_ldpq(vq, kyberConsts);
 6228     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6229     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6230
 6231     // level 4
 6232
          // Same step again; the two operand sets are now 64 bytes apart and
          // are addressed through offsets1.
 6233     int offsets1[4] = { 0, 32, 128, 160 };
 6234     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 6235     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 6236     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6237     vs_subv(vs1, __ T8H, vs1, vs2);
 6238     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 6239     load64shorts(vs2, zetas);
 6240     vs_ldpq(vq, kyberConsts);
 6241     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6242     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 6243
 6244     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 6245     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6246     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6247     vs_subv(vs1, __ T8H, vs1, vs2);
 6248     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 6249     load64shorts(vs2, zetas);
 6250     vs_ldpq(vq, kyberConsts);
 6251     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6252     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6253
 6254     // level 5
 6255
          // The operand sets are now whole 128 byte runs, addressed through
          // tmpAddr: sums go to coeffs + 0, montmul'd differences to coeffs + 128.
 6256     __ add(tmpAddr, coeffs, 0);
 6257     load64shorts(vs1, tmpAddr);
 6258     __ add(tmpAddr, coeffs, 128);
 6259     load64shorts(vs2, tmpAddr);
 6260     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6261     vs_subv(vs1, __ T8H, vs1, vs2);
 6262     __ add(tmpAddr, coeffs, 0);
 6263     store64shorts(vs3, tmpAddr);
 6264     load64shorts(vs2, zetas);
 6265     vs_ldpq(vq, kyberConsts);
 6266     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6267     __ add(tmpAddr, coeffs, 128);
 6268     store64shorts(vs2, tmpAddr);
 6269
          // n.b. no explicit address setup for this load: it relies on the
          // store64shorts above having advanced tmpAddr from coeffs + 128 to
          // coeffs + 256 (the same adjustment the comments in the final
          // scaling step describe)
 6270     load64shorts(vs1, tmpAddr);
 6271     __ add(tmpAddr, coeffs, 384);
 6272     load64shorts(vs2, tmpAddr);
 6273     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6274     vs_subv(vs1, __ T8H, vs1, vs2);
 6275     __ add(tmpAddr, coeffs, 256);
 6276     store64shorts(vs3, tmpAddr);
 6277     load64shorts(vs2, zetas);
 6278     vs_ldpq(vq, kyberConsts);
 6279     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6280     __ add(tmpAddr, coeffs, 384);
 6281     store64shorts(vs2, tmpAddr);
 6282
 6283     // Barrett reduction at indexes where overflow may happen
 6284
 6285     // load q and the multiplier for the Barrett reduction
          // n.b. vq1 and vq2 (declared at the first Barrett reduction) name the
          // registers reloaded here
 6286     __ add(tmpAddr, kyberConsts, 16);
 6287     vs_ldpq(vq, tmpAddr);
 6288
          // Only the front half of vs1 is loaded and stored here (two ldpq
          // groups at offsets 0 and 256). The three vector operations are
          // still issued over the full 8 register sequences; their results
          // for the back half are not stored.
 6289     int offsets0[2] = { 0, 256 };
 6290     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6291     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6292     vs_sshr(vs2, __ T8H, vs2, 11);
 6293     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6294     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6295
 6296     // level 6
 6297
          // Last butterfly level: the operand sets are 256 bytes apart.
          // first half pairs coeffs + 0 with coeffs + 256
 6298     __ add(tmpAddr, coeffs, 0);
 6299     load64shorts(vs1, tmpAddr);
 6300     __ add(tmpAddr, coeffs, 256);
 6301     load64shorts(vs2, tmpAddr);
 6302     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6303     vs_subv(vs1, __ T8H, vs1, vs2);
 6304     __ add(tmpAddr, coeffs, 0);
 6305     store64shorts(vs3, tmpAddr);
 6306     load64shorts(vs2, zetas);
 6307     vs_ldpq(vq, kyberConsts);
 6308     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6309     __ add(tmpAddr, coeffs, 256);
 6310     store64shorts(vs2, tmpAddr);
 6311
          // second half pairs coeffs + 128 with coeffs + 384
 6312     __ add(tmpAddr, coeffs, 128);
 6313     load64shorts(vs1, tmpAddr);
 6314     __ add(tmpAddr, coeffs, 384);
 6315     load64shorts(vs2, tmpAddr);
 6316     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6317     vs_subv(vs1, __ T8H, vs1, vs2);
 6318     __ add(tmpAddr, coeffs, 128);
 6319     store64shorts(vs3, tmpAddr);
 6320     load64shorts(vs2, zetas);
 6321     vs_ldpq(vq, kyberConsts);
 6322     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6323     __ add(tmpAddr, coeffs, 384);
 6324     store64shorts(vs2, tmpAddr);
 6325
 6326     // multiply by 2^-n
 6327
 6328     // load toMont(2^-n mod q)
          // v29 is the register named by the constant sequence vq3
 6329     __ add(tmpAddr, kyberConsts, 48);
 6330     __ ldr(v29, __ Q, tmpAddr);
 6331
          // vq is loaded once for all four montmuls below; none of the
          // statements that follow names vs3 or vq as a destination.
 6332     vs_ldpq(vq, kyberConsts);
 6333     __ add(tmpAddr, coeffs, 0);
 6334     load64shorts(vs1, tmpAddr);
 6335     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6336     __ add(tmpAddr, coeffs, 0);
 6337     store64shorts(vs2, tmpAddr);
 6338
 6339     // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so
 6340     load64shorts(vs1, tmpAddr);
 6341     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6342     __ add(tmpAddr, coeffs, 128);
 6343     store64shorts(vs2, tmpAddr);
 6344
 6345     // now tmpAddr contains coeffs + 256
 6346     load64shorts(vs1, tmpAddr);
 6347     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6348     __ add(tmpAddr, coeffs, 256);
 6349     store64shorts(vs2, tmpAddr);
 6350
 6351     // now tmpAddr contains coeffs + 384
 6352     load64shorts(vs1, tmpAddr);
 6353     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6354     __ add(tmpAddr, coeffs, 384);
 6355     store64shorts(vs2, tmpAddr);
 6356
 6357     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6358     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6359     __ ret(lr);
 6360
 6361     // record the stub entry and end
 6362     store_archive_data(stub_id, start, __ pc());
 6363
 6364     return start;
 6365   }
 6366 
 6367   // Kyber multiply polynomials in the NTT domain.
 6368   // Implements
 6369   // static int implKyberNttMult(
 6370   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 6371   //
 6372   // The actual algorithm that is used here differs from the one in the Java
 6373   // implementation, it uses Montgomery multiplications instead of Barrett
 6374   // reduction, but the end result modulo MLKEM_Q is the same. This is the
 6375   // Java equivalent of this intrinsic implementation:
 6376   // static void implKyberNttMultJava(short[] result, short[] ntta, short[] nttb) {
 6377   //         for (int m = 0; m < ML_KEM_N / 2; m++) {
 6378   //             int a0 = ntta[2 * m];
 6379   //             int a1 = ntta[2 * m + 1];
 6380   //             int b0 = nttb[2 * m];
 6381   //             int b1 = nttb[2 * m + 1];
 6382   //             int r = montMul(a0, b0) +
 6383   //                     montMul(montMul(a1, b1), MONT_ZETAS_FOR_NTT_MULT[m]);
 6384   //             result[2 * m] = (short) montMul(r, MONT_R_SQUARE_MOD_Q);
 6385   //             result[2 * m + 1] = (short) montMul(
 6386   //                     (montMul(a0, b1) + montMul(a1, b0)), MONT_R_SQUARE_MOD_Q);
 6387   //          }
 6388   // }
 6389   //
 6390   // result (short[256]) = c_rarg0
 6391   // ntta (short[256]) = c_rarg1
 6392   // nttb (short[256]) = c_rarg2
 6393   // zetas (short[128]) = c_rarg3
 6394   address generate_kyberNttMult() {
 6395     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 6396     int entry_count = StubInfo::entry_count(stub_id);
 6397     assert(entry_count == 1, "sanity check");
 6398     address start = load_archive_data(stub_id);
 6399     if (start != nullptr) {
 6400       return start;
 6401     }
 6402     __ align(CodeEntryAlignment);
 6403     StubCodeMark mark(this, stub_id);
 6404     start = __ pc();
 6405     __ enter();
 6406 
 6407     const Register result = c_rarg0;
 6408     const Register ntta = c_rarg1;
 6409     const Register nttb = c_rarg2;
 6410     const Register zetas = c_rarg3;
 6411 
 6412     const Register kyberConsts = r10;
 6413     const Register limit = r11;
 6414 
 6415     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 6416     VSeq<4> vs3(16), vs4(20);
 6417     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 6418     VSeq<2> vz(28);          // pair of zetas
 6419     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 6420 
 6421     __ lea(kyberConsts,
 6422              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6423 
 6424     Label kyberNttMult_loop;
 6425 
 6426     __ add(limit, result, 512);
 6427 
 6428     // load q and qinv
 6429     vs_ldpq(vq, kyberConsts);
 6430 
 6431     // load R^2 mod q (to convert back from Montgomery representation)
 6432     __ add(kyberConsts, kyberConsts, 64);
 6433     __ ldr(v27, __ Q, kyberConsts);
 6434 
 6435     __ BIND(kyberNttMult_loop);
 6436 
 6437     // load 16 zetas
 6438     vs_ldpq_post(vz, zetas);
 6439 
 6440     // load 2 sets of 32 coefficients from the two input arrays
 6441     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 6442     // are striped across pairs of vector registers
 6443     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 6444     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 6445     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 6446     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 6447 
 6448     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 6449     // i.e. montmul the first and second halves of vs1 in order and
 6450     // then with one sequence reversed storing the two results in vs3
 6451     //
 6452     // vs3[0] <- montmul(a0, b0)
 6453     // vs3[1] <- montmul(a1, b1)
 6454     // vs3[2] <- montmul(a0, b1)
 6455     // vs3[3] <- montmul(a1, b0)
 6456     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 6457     kyber_montmul16(vs_back(vs3),
 6458                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 6459 
 6460     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 6461     // i.e. montmul the first and second halves of vs4 in order and
 6462     // then with one sequence reversed storing the two results in vs1
 6463     //
 6464     // vs1[0] <- montmul(a2, b2)
 6465     // vs1[1] <- montmul(a3, b3)
 6466     // vs1[2] <- montmul(a2, b3)
 6467     // vs1[3] <- montmul(a3, b2)
 6468     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 6469     kyber_montmul16(vs_back(vs1),
 6470                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 6471 
 6472     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 6473     // We can schedule two montmuls at a time if we use a suitable vector
 6474     // sequence <vs3[1], vs1[1]>.
 6475     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 6476     VSeq<2> vs5(vs3[1], delta);
 6477 
 6478     // vs3[1] <- montmul(montmul(a1, b1), z0)
 6479     // vs1[1] <- montmul(montmul(a3, b3), z1)
 6480     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 6481 
 6482     // add results in pairs storing in vs3
 6483     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 6484     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 6485     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 6486 
 6487     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 6488     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 6489     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 6490 
 6491     // vs1 <- montmul(vs3, montRSquareModQ)
 6492     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 6493 
 6494     // store back the two pairs of result vectors de-interleaved as 8H elements
 6495     // i.e. storing each pairs of shorts striped across a register pair adjacent
 6496     // in memory
 6497     vs_st2_post(vs1, __ T8H, result);
 6498 
 6499     __ cmp(result, limit);
 6500     __ br(Assembler::NE, kyberNttMult_loop);
 6501 
 6502     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6503     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6504     __ ret(lr);
 6505 
 6506     // record the stub entry and end
 6507     store_archive_data(stub_id, start, __ pc());
 6508 
 6509     return start;
 6510   }
 6511 
 6512   // Kyber add 2 polynomials.
 6513   // Implements
 6514   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 6515   //
 6516   // result (short[256]) = c_rarg0
 6517   // a (short[256]) = c_rarg1
 6518   // b (short[256]) = c_rarg2
 6519   address generate_kyberAddPoly_2() {
 6520     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 6521     int entry_count = StubInfo::entry_count(stub_id);
 6522     assert(entry_count == 1, "sanity check");
 6523     address start = load_archive_data(stub_id);
 6524     if (start != nullptr) {
 6525       return start;
 6526     }
 6527     __ align(CodeEntryAlignment);
 6528     StubCodeMark mark(this, stub_id);
 6529     start = __ pc();
 6530     __ enter();
 6531 
 6532     const Register result = c_rarg0;
 6533     const Register a = c_rarg1;
 6534     const Register b = c_rarg2;
 6535 
 6536     const Register kyberConsts = r11;
 6537 
 6538     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 6539     // So, we can load, add and store the data in 3 groups of 11,
 6540     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 6541     // registers. A further constraint is that the mapping needs
 6542     // to skip callee saves. So, we allocate the register
 6543     // sequences using two 8 sequences, two 2 sequences and two
 6544     // single registers.
 6545     VSeq<8> vs1_1(0);
 6546     VSeq<2> vs1_2(16);
 6547     FloatRegister vs1_3 = v28;
 6548     VSeq<8> vs2_1(18);
 6549     VSeq<2> vs2_2(26);
 6550     FloatRegister vs2_3 = v29;
 6551 
 6552     // two constant vector sequences
 6553     VSeq<8> vc_1(31, 0);
 6554     VSeq<2> vc_2(31, 0);
 6555 
 6556     FloatRegister vc_3 = v31;
 6557     __ lea(kyberConsts,
 6558              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6559 
 6560     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6561     for (int i = 0; i < 3; i++) {
 6562       // load 80 or 88 values from a into vs1_1/2/3
 6563       vs_ldpq_post(vs1_1, a);
 6564       vs_ldpq_post(vs1_2, a);
 6565       if (i < 2) {
 6566         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6567       }
 6568       // load 80 or 88 values from b into vs2_1/2/3
 6569       vs_ldpq_post(vs2_1, b);
 6570       vs_ldpq_post(vs2_2, b);
 6571       if (i < 2) {
 6572         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6573       }
 6574       // sum 80 or 88 values across vs1 and vs2 into vs1
 6575       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6576       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6577       if (i < 2) {
 6578         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6579       }
 6580       // add constant to all 80 or 88 results
 6581       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6582       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6583       if (i < 2) {
 6584         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6585       }
 6586       // store 80 or 88 values
 6587       vs_stpq_post(vs1_1, result);
 6588       vs_stpq_post(vs1_2, result);
 6589       if (i < 2) {
 6590         __ str(vs1_3, __ Q, __ post(result, 16));
 6591       }
 6592     }
 6593 
 6594     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6595     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6596     __ ret(lr);
 6597 
 6598     // record the stub entry and end
 6599     store_archive_data(stub_id, start, __ pc());
 6600 
 6601     return start;
 6602   }
 6603 
 6604   // Kyber add 3 polynomials.
 6605   // Implements
 6606   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 6607   //
 6608   // result (short[256]) = c_rarg0
 6609   // a (short[256]) = c_rarg1
 6610   // b (short[256]) = c_rarg2
 6611   // c (short[256]) = c_rarg3
 6612   address generate_kyberAddPoly_3() {
 6613     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 6614     int entry_count = StubInfo::entry_count(stub_id);
 6615     assert(entry_count == 1, "sanity check");
 6616     address start = load_archive_data(stub_id);
 6617     if (start != nullptr) { // nothing to generate: use the address handed back by load_archive_data
 6618       return start;
 6619     }
 6620     __ align(CodeEntryAlignment);
 6621     StubCodeMark mark(this, stub_id);
 6622     start = __ pc();
 6623     __ enter();
 6624 
 6625     const Register result = c_rarg0;
 6626     const Register a = c_rarg1;
 6627     const Register b = c_rarg2;
 6628     const Register c = c_rarg3;
 6629 
 6630     const Register kyberConsts = r11;
 6631 
 6632     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6633     // quadwords.  So, we can load, add and store the data in 3
 6634     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6635     // of 10 or 11 registers. A further constraint is that the
 6636     // mapping needs to skip callee saves. So, we allocate the
 6637     // register sequences using two 8 sequences, two 2 sequences
 6638     // and two single registers.
 6639     VSeq<8> vs1_1(0);
 6640     VSeq<2> vs1_2(16);
 6641     FloatRegister vs1_3 = v28;
 6642     VSeq<8> vs2_1(18);
 6643     VSeq<2> vs2_2(26);
 6644     FloatRegister vs2_3 = v29;
 6645 
 6646     // two constant vector sequences (base 31, second arg 0 -- presumably every element is v31; confirm)
 6647     VSeq<8> vc_1(31, 0);
 6648     VSeq<2> vc_2(31, 0);
 6649 
 6650     FloatRegister vc_3 = v31;
 6651 
 6652     __ lea(kyberConsts,
 6653              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6654 
 6655     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q, read from byte offset 16 of kyberConsts
 6656     for (int i = 0; i < 3; i++) {
 6657       // load 88 (first two passes) or 80 (last pass) values from a into vs1_1/2/3
 6658       vs_ldpq_post(vs1_1, a);
 6659       vs_ldpq_post(vs1_2, a);
 6660       if (i < 2) { // the 11th quadword is only handled in the first two passes
 6661         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6662       }
 6663       // load 80 or 88 values from b into vs2_1/2/3
 6664       vs_ldpq_post(vs2_1, b);
 6665       vs_ldpq_post(vs2_2, b);
 6666       if (i < 2) {
 6667         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6668       }
 6669       // sum 80 or 88 values across vs1 and vs2 into vs1 (a + b)
 6670       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6671       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6672       if (i < 2) {
 6673         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6674       }
 6675       // load 80 or 88 values from c into vs2_1/2/3, reusing the registers that held b
 6676       vs_ldpq_post(vs2_1, c);
 6677       vs_ldpq_post(vs2_2, c);
 6678       if (i < 2) {
 6679         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6680       }
 6681       // sum 80 or 88 values across vs1 and vs2 into vs1 (a + b + c)
 6682       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6683       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6684       if (i < 2) {
 6685         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6686       }
 6687       // add the constant loaded into v31 to all 80 or 88 results
 6688       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6689       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6690       if (i < 2) {
 6691         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6692       }
 6693       // store 80 or 88 values, post-incrementing result
 6694       vs_stpq_post(vs1_1, result);
 6695       vs_stpq_post(vs1_2, result);
 6696       if (i < 2) {
 6697         __ str(vs1_3, __ Q, __ post(result, 16));
 6698       }
 6699     }
 6700 
 6701     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6702     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6703     __ ret(lr);
 6704 
 6705     // record the stub entry and end
 6706     store_archive_data(stub_id, start, __ pc());
 6707 
 6708     return start;
 6709   }
 6710 
 6711   // Kyber parse XOF output to polynomial coefficient candidates
 6712   // or decodePoly(12, ...).
 6713   // Implements
 6714   // static int implKyber12To16(
 6715   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6716   //
 6717   // we assume that parsed and condensed are allocated such that for
 6718   // n = (parsedLength + 63) / 64
 6719   // n blocks of 96 bytes of input can be processed, i.e.
 6720   // index + n * 96 <= condensed.length and
 6721   // n * 64 <= parsed.length
 6722   //
 6723   // condensed (byte[]) = c_rarg0
 6724   // condensedIndex = c_rarg1
 6725   // parsed (short[]) = c_rarg2
 6726   // parsedLength = c_rarg3
 6727   address generate_kyber12To16() {
 6728     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6729     int entry_count = StubInfo::entry_count(stub_id);
 6730     assert(entry_count == 1, "sanity check");
 6731     address start = load_archive_data(stub_id);
 6732     if (start != nullptr) { // nothing to generate: use the address handed back by load_archive_data
 6733       return start;
 6734     }
 6735     Label L_F00, L_loop;
 6736 
 6737     __ align(CodeEntryAlignment);
 6738     StubCodeMark mark(this, stub_id);
 6739     start = __ pc();
 6740     __ enter();
 6741 
 6742     const Register condensed = c_rarg0;
 6743     const Register condensedOffs = c_rarg1;
 6744     const Register parsed = c_rarg2;
 6745     const Register parsedLength = c_rarg3;
 6746 
 6747     const Register tmpAddr = r11;
 6748 
 6749     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6750     // quadwords so we need a 6 vector sequence for the inputs.
 6751     // Parsing produces 64 shorts, employing two 8 vector
 6752     // sequences to store and combine the intermediate data.
 6753     VSeq<6> vin(24);
 6754     VSeq<8> va(0), vb(16);
 6755 
 6756     __ adr(tmpAddr, L_F00);
 6757     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6758     __ add(condensed, condensed, condensedOffs); // advance condensed by condensedOffs
 6759 
 6760     __ BIND(L_loop);
 6761     // load 96 (6 x 16B) byte values
 6762     vs_ld3_post(vin, __ T16B, condensed);
 6763 
 6764     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6765     // holds 48 (16x3) contiguous bytes from memory striped
 6766     // horizontally across each of the 16 byte lanes. Equivalently,
 6767     // that is 16 pairs of 12-bit integers. Likewise the back half
 6768     // holds the next 48 bytes in the same arrangement.
 6769 
 6770     // Each vector in the front half can also be viewed as a vertical
 6771     // strip across the 16 pairs of 12 bit integers. Each byte in
 6772     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6773     // byte in vin[1] stores the high 4 bits of the first int and the
 6774     // low 4 bits of the second int. Each byte in vin[2] stores the
 6775     // high 8 bits of the second int. Likewise the vectors in the
 6776     // second half.
 6777 
 6778     // Converting the data to 16-bit shorts requires first of all
 6779     // expanding each of the 6 x 16B vectors into 6 corresponding
 6780     // pairs of 8H vectors. Mask, shift and add operations on the
 6781     // resulting vector pairs can be used to combine 4 and 8 bit
 6782     // parts of related 8H vector elements.
 6783     //
 6784     // The middle vectors (vin[1] and vin[4]) are actually expanded
 6785     // twice, one copy manipulated to provide the higher 4 bits
 6786     // belonging to the first short in a pair and another copy
 6787     // manipulated to provide the lower 4 bits belonging to the
 6788     // second short in a pair. This is why the vector sequences va
 6789     // and vb used to hold the expanded 8H elements are of length 8.
 6790 
 6791     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6792     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6793     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6794     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6795     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6796     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6797     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6798     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6799 
 6800     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6801     // and vb[4:5]
 6802     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6803     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6804     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6805     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6806     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6807     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6808 
 6809     // shift lo byte of copy 1 of the middle stripe into the high byte
 6810     __ shl(va[2], __ T8H, va[2], 8);
 6811     __ shl(va[3], __ T8H, va[3], 8);
 6812     __ shl(vb[2], __ T8H, vb[2], 8);
 6813     __ shl(vb[3], __ T8H, vb[3], 8);
 6814 
 6815     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6816     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6817     // are in bit positions [4..11].
 6818     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6819     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6820     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6821     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6822 
 6823     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 (the
 6824     // 0x0f00 mask in v31 keeps bits [8..11]) and shift lo 4 bits of
 6825     // the 2nd 12-bit int in a pair to the bottom of copy2
 6826     __ andr(va[2], __ T16B, va[2], v31);
 6827     __ andr(va[3], __ T16B, va[3], v31);
 6828     __ ushr(va[4], __ T8H, va[4], 4);
 6829     __ ushr(va[5], __ T8H, va[5], 4);
 6830     __ andr(vb[2], __ T16B, vb[2], v31);
 6831     __ andr(vb[3], __ T16B, vb[3], v31);
 6832     __ ushr(vb[4], __ T8H, vb[4], 4);
 6833     __ ushr(vb[5], __ T8H, vb[5], 4);
 6834 
 6835     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6836     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6837     // n.b. the ordering ensures: i) inputs are consumed before they
 6838     // are overwritten ii) the order of 16-bit results across successive
 6839     // pairs of vectors in va and then vb reflects the order of the
 6840     // corresponding 12-bit inputs
 6841     __ addv(va[0], __ T8H, va[0], va[2]);
 6842     __ addv(va[2], __ T8H, va[1], va[3]);
 6843     __ addv(va[1], __ T8H, va[4], va[6]);
 6844     __ addv(va[3], __ T8H, va[5], va[7]);
 6845     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6846     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6847     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6848     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6849 
 6850     // store 64 results interleaved as shorts
 6851     vs_st2_post(vs_front(va), __ T8H, parsed);
 6852     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6853 
 6854     __ sub(parsedLength, parsedLength, 64); // account for the 64 shorts stored above
 6855     __ cmp(parsedLength, (u1)0);
 6856     __ br(Assembler::GT, L_loop); // repeat while parsedLength is still greater than zero
 6857 
 6858     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6859     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6860     __ ret(lr);
 6861 
 6862     // bind label and generate constant data used by this stub
 6863     __ BIND(L_F00);
 6864     __ emit_int64(0x0f000f000f000f00); // 2 x 64 bits make up the quadword loaded into v31 above
 6865     __ emit_int64(0x0f000f000f000f00);
 6866 
 6867     // record the stub entry and end
 6868     store_archive_data(stub_id, start, __ pc());
 6869 
 6870     return start;
 6871   }
 6872 
 6873   // Kyber Barrett reduce function.
 6874   // Implements
 6875   // static int implKyberBarrettReduce(short[] coeffs) {}
 6876   //
 6877   // coeffs (short[256]) = c_rarg0
 6878   address generate_kyberBarrettReduce() {
 6879     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6880     int entry_count = StubInfo::entry_count(stub_id);
 6881     assert(entry_count == 1, "sanity check");
 6882     address start = load_archive_data(stub_id);
 6883     if (start != nullptr) { // nothing to generate: use the address handed back by load_archive_data
 6884       return start;
 6885     }
 6886     __ align(CodeEntryAlignment);
 6887     StubCodeMark mark(this, stub_id);
 6888     start = __ pc();
 6889     __ enter();
 6890 
 6891     const Register coeffs = c_rarg0;
 6892 
 6893     const Register kyberConsts = r10;
 6894     const Register result = r11;
 6895 
 6896     // As above we process 256 sets of values in total i.e. 32 x
 6897     // 8H quadwords. So, we can load, reduce and store the data in 3
 6898     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6899     // of 10 or 11 registers. A further constraint is that the
 6900     // mapping needs to skip callee saves. So, we allocate the
 6901     // register sequences using two 8 sequences, two 2 sequences
 6902     // and two single registers.
 6903     VSeq<8> vs1_1(0);
 6904     VSeq<2> vs1_2(16);
 6905     FloatRegister vs1_3 = v28;
 6906     VSeq<8> vs2_1(18);
 6907     VSeq<2> vs2_2(26);
 6908     FloatRegister vs2_3 = v29;
 6909 
 6910     // we also need a pair of corresponding constant sequences
 6911     // (second constructor arg 0 -- presumably every element is the base register; confirm)
 6912     VSeq<8> vc1_1(30, 0);
 6913     VSeq<2> vc1_2(30, 0);
 6914     FloatRegister vc1_3 = v30; // for kyber_q
 6915 
 6916     VSeq<8> vc2_1(31, 0);
 6917     VSeq<2> vc2_2(31, 0);
 6918     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6919 
 6920     __ add(result, coeffs, 0); // result is a second cursor over coeffs, used by the stores below
 6921     __ lea(kyberConsts,
 6922              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6923 
 6924     // load q and the multiplier for the Barrett reduction
 6925     __ add(kyberConsts, kyberConsts, 16); // both constants are read starting at byte offset 16
 6926     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6927 
 6928     for (int i = 0; i < 3; i++) {
 6929       // load 80 or 88 coefficients
 6930       vs_ldpq_post(vs1_1, coeffs);
 6931       vs_ldpq_post(vs1_2, coeffs);
 6932       if (i < 2) { // the 11th quadword is only handled in the first two passes
 6933         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6934       }
 6935 
 6936       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6937       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6938       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6939       if (i < 2) {
 6940         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6941       }
 6942 
 6943       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6944       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6945       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6946       if (i < 2) {
 6947         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6948       }
 6949 
 6950       // vs1 <- vs1 - vs2 * kyber_q
 6951       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6952       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6953       if (i < 2) {
 6954         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6955       }
 6956 
 6957       vs_stpq_post(vs1_1, result); // store the 80 or 88 reduced values back via result
 6958       vs_stpq_post(vs1_2, result);
 6959       if (i < 2) {
 6960         __ str(vs1_3, __ Q, __ post(result, 16));
 6961       }
 6962     }
 6963 
 6964     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6965     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 6966     __ ret(lr);
 6967 
 6968     // record the stub entry and end
 6969     store_archive_data(stub_id, start, __ pc());
 6970 
 6971     return start;
 6972   }
 6973 
 6974 
 6975   // Dilithium-specific montmul helper routines that generate parallel
 6976   // code for, respectively, a single 4x4s vector sequence montmul or
 6977   // two such multiplies in a row.
 6978 
 6979   // Perform 16 32-bit Montgomery multiplications in parallel
 6980   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6981                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6982     // Use the helper routine to schedule a 4x4S Montgomery multiply;
 6983     // it will assert that the register use is valid.
 6984     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); // all five sequences are passed through unchanged
 6985   }
 6986 
 6987   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6988   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6989                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6990     // Schedule two successive 4x4S multiplies via the montmul helper
 6991     // on the front and back halves of va, vb and vc. The helper will
 6992     // assert that the register use has no overlap conflicts on each
 6993     // individual call but we also need to ensure that the necessary
 6994     // disjoint/equality constraints are met across both calls.
 6995 
 6996     // vb, vc, vtmp and vq must be disjoint. va must either be
 6997     // disjoint from all other registers or equal vc
 6998 
 6999     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 7000     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 7001     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 7002 
 7003     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 7004     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 7005 
 7006     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 7007 
 7008     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 7009     assert(vs_disjoint(va, vb), "va and vb overlap");
 7010     assert(vs_disjoint(va, vq), "va and vq overlap");
 7011     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 7012 
 7013     // We multiply the front and back halves of each sequence 4 at a
 7014     // time because
 7015     //
 7016     // 1) we are currently only able to get 4-way instruction
 7017     // parallelism at best
 7018     //
 7019     // 2) we need registers for the constants in vq and temporary
 7020     // scratch registers to hold intermediate results so vtmp can only
 7021     // be a VSeq<4> which means we only have 4 scratch slots.
 7022 
 7023     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq); // front halves
 7024     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq); // back halves, same vtmp and vq
 7025   }
 7026 
 7027   // Perform combined montmul then add/sub on 4x4S vectors.
 7028   void dilithium_montmul16_sub_add(
 7029           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 7030           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 7031     // compute a = montmul(a1, c) -- n.b. the product a overwrites vc
 7032     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 7033     // output a1 = a0 - a
 7034     vs_subv(va1, __ T4S, va0, vc);
 7035     //    and a0 = a0 + a
 7036     vs_addv(va0, __ T4S, va0, vc);
 7037   }
 7038 
 7039   // Perform combined add/sub then montmul on 4x4S vectors.
 7040   void dilithium_sub_add_montmul16(
 7041           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 7042           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 7043     // compute c = a0 - a1 -- n.b. c is held in vtmp1
 7044     vs_subv(vtmp1, __ T4S, va0, va1);
 7045     // output a0 = a0 + a1
 7046     vs_addv(va0, __ T4S, va0, va1);
 7047     // output a1 = b montmul c -- n.b. vtmp2 is the scratch sequence for the multiply
 7048     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 7049   }
 7050 
 7051   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 7052   // in the Java implementation come in sequences of at least 8, so we
 7053   // can use ldpq to collect the corresponding data into pairs of vector
 7054   // registers.
 7055   // We collect the coefficients corresponding to the 'j+l' indexes into
 7056   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 7057   // then we do the (Montgomery) multiplications by the zetas in parallel
 7058   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 7059   // v0-v7, then do the additions into v24-v31 and the subtractions into
 7060   // v0-v7 and finally save the results back to the coeffs array.
 7061   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 7062     const Register coeffs, const Register zetas) {
 7063     int c1 = 0;   // start position of the first group of coefficients; never changed below
 7064     int c2 = 512; // start position of the second group; halved after every level
 7065     int startIncr;
 7066     // don't use callee save registers v8 - v15
 7067     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7068     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7069     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7070     int offsets[4] = { 0, 32, 64, 96 };
 7071 
 7072     for (int level = 0; level < 5; level++) {
 7073       int c1Start = c1;
 7074       int c2Start = c2;
 7075       if (level == 3) { // levels 0 - 2 keep the initial offsets
 7076         offsets[1] = 32;
 7077         offsets[2] = 128;
 7078         offsets[3] = 160;
 7079       } else if (level == 4) {
 7080         offsets[1] = 64;
 7081         offsets[2] = 128;
 7082         offsets[3] = 192;
 7083       }
 7084 
 7085       // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 7086       // time at 4 different offsets and multiply them in order by the
 7087       // next set of input values. So we employ indexed load and store
 7088       // pair instructions with arrangement 4S.
 7089       for (int i = 0; i < 4; i++) {
 7090         // reload q and qinv
 7091         vs_ldpq(vq, dilithiumConsts); // qInv, q
 7092         // load 8x4S coefficients via second start pos == c2
 7093         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 7094         // load next 8x4S inputs == b
 7095         vs_ldpq_post(vs2, zetas);
 7096         // compute a == c2 * b mod MONT_Q
 7097         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7098         // load 8x4s coefficients via first start pos == c1
 7099         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7100         // compute a1 =  c1 + a -- n.b. writes vs3 which overlaps vtmp and vq
 7101         vs_addv(vs3, __ T4S, vs1, vs2);
 7102         // compute a2 =  c1 - a
 7103         vs_subv(vs1, __ T4S, vs1, vs2);
 7104         // output a1 and a2
 7105         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7106         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 7107 
 7108         int k = 4 * level + i; // counts the passes made so far across all levels
 7109 
 7110         if (k > 7) {
 7111           startIncr = 256;
 7112         } else if (k == 5) {
 7113           startIncr = 384;
 7114         } else {
 7115           startIncr = 128;
 7116         }
 7117 
 7118         c1Start += startIncr;
 7119         c2Start += startIncr;
 7120       }
 7121 
 7122       c2 /= 2; // the second group starts half as far from the first at the next level
 7123     }
 7124   }
 7125 
 7126   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 7127   // Implements the method
 7128   // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
 7129   // of the Java class sun.security.provider.ML_DSA
 7130   //
 7131   // coeffs (int[256]) = c_rarg0
 7132   // zetas (int[256]) = c_rarg1
 7133   address generate_dilithiumAlmostNtt() {
 7134     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 7135     int entry_count = StubInfo::entry_count(stub_id);
 7136     assert(entry_count == 1, "sanity check");
 7137     address start = load_archive_data(stub_id);
 7138     if (start != nullptr) { // nothing to generate: use the address handed back by load_archive_data
 7139       return start;
 7140     }
 7141     __ align(CodeEntryAlignment);
 7142     StubCodeMark mark(this, stub_id);
 7143     start = __ pc();
 7144     __ enter();
 7145 
 7146     const Register coeffs = c_rarg0;
 7147     const Register zetas = c_rarg1;
 7148 
 7149     const Register tmpAddr = r9;
 7150     const Register dilithiumConsts = r10;
 7151     const Register result = r11;
 7152     // don't use callee save registers v8 - v15
 7153     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7154     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7155     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7156     int offsets[4] = { 0, 32, 64, 96};
 7157     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7158     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7159     __ add(result, coeffs, 0); // NOTE(review): result is not referenced again in this stub
 7160     __ lea(dilithiumConsts,
 7161              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7162 
 7163     // Each level represents one iteration of the outer for loop of the Java version.
 7164 
 7165     // level 0-4
 7166     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 7167 
 7168     // level 5
 7169 
 7170     // At level 5 the coefficients we need to combine with the zetas
 7171     // are grouped in memory in blocks of size 4. So, for both sets of
 7172     // coefficients we load 4 adjacent values at 8 different offsets
 7173     // using an indexed ldr with register variant Q and multiply them
 7174     // in sequence order by the next set of inputs. Likewise we store
 7175     // the results using an indexed str with register variant Q.
 7176     for (int i = 0; i < 1024; i += 256) {
 7177       // reload constants q, qinv each iteration as they get clobbered later
 7178       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7179       // load 32 (8x4S) coefficients via first offsets = c1
 7180       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7181       // load next 32 (8x4S) inputs = b
 7182       vs_ldpq_post(vs2, zetas);
 7183       // a = b montmul c1
 7184       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7185       // load 32 (8x4S) coefficients via second offsets = c2
 7186       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 7187       // add/sub with result of multiply
 7188       vs_addv(vs3, __ T4S, vs1, vs2);     // a0 = c2 + a
 7189       vs_subv(vs1, __ T4S, vs1, vs2);     // a1 = c2 - a
 7190       // write back new coefficients: the sums via the second offsets, the differences via the first
 7191       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 7192       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 7193     }
 7194 
 7195     // level 6
 7196     // At level 6 the coefficients we need to combine with the zetas
 7197     // are grouped in memory in pairs, the first two being add/sub
 7198     // inputs and the second two montmul inputs. We can still implement
 7199     // the montmul+sub+add using 4-way parallelism but only if we
 7200     // combine the coefficients with the zetas 16 at a time. We load 8
 7201     // adjacent values at 4 different offsets using an ld2 load with
 7202     // arrangement 2D. That interleaves the lower and upper halves of
 7203     // each pair of quadwords into successive vector registers. We
 7204     // then need to montmul the 4 odd elements of the coefficients
 7205     // register sequence by the zetas in order and then add/sub the
 7206     // products with the 4 even elements of the register sequence. We use an
 7207     // equivalent st2 operation to store the results back into memory
 7208     // de-interleaved.
 7209     for (int i = 0; i < 1024; i += 128) {
 7210       // reload constants q, qinv each iteration as they get clobbered later
 7211       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7212       // load interleaved 16 (4x2D) coefficients via offsets
 7213       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7214       // load next 16 (4x4S) inputs
 7215       vs_ldpq_post(vs_front(vs2), zetas);
 7216       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7217       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7218                                   vs_front(vs2), vtmp, vq);
 7219       // store interleaved 16 (4x2D) coefficients via offsets
 7220       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7221     }
 7222 
 7223     // level 7
 7224     // At level 7 the coefficients we need to combine with the zetas
 7225     // occur singly with add/sub inputs alternating with montmul
 7226     // inputs. Once again we can use 4-way parallelism to combine 16
 7227     // zetas at a time. However, we have to load 8 adjacent values at
 7228     // 4 different offsets using an ld2 load with arrangement 4S. That
 7229     // places the even words of each pair into one
 7230     // coefficients vector register and the odd words of the pair
 7231     // into the next register. We then need to montmul the 4 odd
 7232     // elements of the coefficients register sequence by the zetas in
 7233     // order and then add/sub the products with the 4 even elements of the
 7234     // register sequence. We use an equivalent st2 operation to store
 7235     // the results back into memory de-interleaved.
 7236 
 7237     for (int i = 0; i < 1024; i += 128) {
 7238       // reload constants q, qinv each iteration as they get clobbered later
 7239       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7240       // load interleaved 16 (4x4S) coefficients via offsets
 7241       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7242       // load next 16 (4x4S) inputs
 7243       vs_ldpq_post(vs_front(vs2), zetas);
 7244       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7245       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7246                                   vs_front(vs2), vtmp, vq);
 7247       // store interleaved 16 (4x4S) coefficients via offsets
 7248       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7249     }
 7250     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7251     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 7252     __ ret(lr);
 7253 
 7254     // record the stub entry and end
 7255     store_archive_data(stub_id, start, __ pc());
 7256 
 7257     return start;
 7258   }
 7259 
 7260   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 7261   // in the Java implementation come in sequences of at least 8, so we
 7262   // can use ldpq to collect the corresponding data into pairs of vector
 7263   // registers
 7264   // We collect the coefficients that correspond to the 'j's into vs1
 7265   // the coefficients that correspond to the 'j+l's into vs2 then
 7266   // do the additions into vs3 and the subtractions into vs1 then
 7267   // save the result of the additions, load the zetas into vs2
 7268   // do the (Montgomery) multiplications by zeta in parallel into vs2
 7269   // finally save the results back to the coeffs array
 7270   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 7271     const Register coeffs, const Register zetas) {
 7272     int c1 = 0;  // start position of the first group of coefficients; never changed below
 7273     int c2 = 32; // start position of the second group; doubled after every level
 7274     int startIncr;
 7275     int offsets[4];
 7276     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7277     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7278     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7279 
 7280     offsets[0] = 0; // the other three entries are assigned at the top of every level
 7281 
 7282     for (int level = 3; level < 8; level++) {
 7283       int c1Start = c1;
 7284       int c2Start = c2;
 7285       if (level == 3) {
 7286         offsets[1] = 64;
 7287         offsets[2] = 128;
 7288         offsets[3] = 192;
 7289       } else if (level == 4) {
 7290         offsets[1] = 32;
 7291         offsets[2] = 128;
 7292         offsets[3] = 160;
 7293       } else {
 7294         offsets[1] = 32;
 7295         offsets[2] = 64;
 7296         offsets[3] = 96;
 7297       }
 7298 
 7299       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 7300       // time at 4 different offsets and multiply them in order by the
 7301       // next set of input values. So we employ indexed load and store
 7302       // pair instructions with arrangement 4S.
 7303       for (int i = 0; i < 4; i++) {
 7304         // load v1 32 (8x4S) coefficients relative to first start index
 7305         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7306         // load v2 32 (8x4S) coefficients relative to second start index
 7307         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 7308         // a0 = v1 + v2 -- n.b. clobbers vqs
 7309         vs_addv(vs3, __ T4S, vs1, vs2);
 7310         // a1 = v1 - v2
 7311         vs_subv(vs1, __ T4S, vs1, vs2);
 7312         // save a0 relative to first start index
 7313         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7314         // load constants q, qinv each iteration as they get clobbered above
 7315         vs_ldpq(vq, dilithiumConsts); // qInv, q
 7316         // load b next 32 (8x4S) inputs
 7317         vs_ldpq_post(vs2, zetas);
 7318         // a = a1 montmul b
 7319         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7320         // save a relative to second start index
 7321         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 7322 
 7323         int k = 4 * level + i; // runs from 12 to 31 because level starts at 3
 7324 
 7325         if (k < 24) {
 7326           startIncr = 256;
 7327         } else if (k == 25) {
 7328           startIncr = 384;
 7329         } else {
 7330           startIncr = 128;
 7331         }
 7332 
 7333         c1Start += startIncr;
 7334         c2Start += startIncr;
 7335       }
 7336 
 7337       c2 *= 2; // the second group starts twice as far from the first at the next level
 7338     }
 7339   }
 7340 
 7341   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 7342   // Implements the method
 7343   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 7344   // the sun.security.provider.ML_DSA class.
 7345   //
 7346   // coeffs (int[256]) = c_rarg0
 7347   // zetas (int[256]) = c_rarg1
 7348   address generate_dilithiumAlmostInverseNtt() {
 7349     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 7350     int entry_count = StubInfo::entry_count(stub_id);
 7351     assert(entry_count == 1, "sanity check");
 7352     address start = load_archive_data(stub_id);
 7353     if (start != nullptr) {
 7354       return start;
 7355     }
 7356     __ align(CodeEntryAlignment);
 7357     StubCodeMark mark(this, stub_id);
 7358     start = __ pc();
 7359     __ enter();
 7360 
 7361     const Register coeffs = c_rarg0;
 7362     const Register zetas = c_rarg1;
 7363 
 7364     const Register tmpAddr = r9;
 7365     const Register dilithiumConsts = r10;
 7366     const Register result = r11;
 7367     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7368     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 7369     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7370     int offsets[4] = { 0, 32, 64, 96 };
 7371     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7372     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7373 
 7374     __ add(result, coeffs, 0);
 7375     __ lea(dilithiumConsts,
 7376              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7377 
 7378     // Each level represents one iteration of the outer for loop of the Java version
 7379 
 7380     // level 0
 7381     // At level 0 we need to interleave adjacent quartets of
 7382     // coefficients before we multiply and add/sub by the next 16
 7383     // zetas just as we did for level 7 in the multiply code. So we
 7384     // load and store the values using an ld2/st2 with arrangement 4S.
 7385     for (int i = 0; i < 1024; i += 128) {
 7386       // load constants q, qinv
 7387       // n.b. this can be moved out of the loop as they do not get
 7388       // clobbered by first two loops
 7389       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7390       // a0/a1 load interleaved 32 (8x4S) coefficients
 7391       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7392       // b load next 32 (8x4S) inputs
 7393       vs_ldpq_post(vs_front(vs2), zetas);
 7394       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7395       // n.b. second half of vs2 provides temporary register storage
 7396       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7397                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7398       // a0/a1 store interleaved 32 (8x4S) coefficients
 7399       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7400     }
 7401 
 7402     // level 1
 7403     // At level 1 we need to interleave pairs of adjacent pairs of
 7404     // coefficients before we multiply by the next 16 zetas just as we
 7405     // did for level 6 in the multiply code. So we load and store the
 7406     // values an ld2/st2 with arrangement 2D.
 7407     for (int i = 0; i < 1024; i += 128) {
 7408       // a0/a1 load interleaved 32 (8x2D) coefficients
 7409       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7410       // b load next 16 (4x4S) inputs
 7411       vs_ldpq_post(vs_front(vs2), zetas);
 7412       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7413       // n.b. second half of vs2 provides temporary register storage
 7414       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7415                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7416       // a0/a1 store interleaved 32 (8x2D) coefficients
 7417       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7418     }
 7419 
 7420     // level 2
 7421     // At level 2 coefficients come in blocks of 4. So, we load 4
 7422     // adjacent coefficients at 8 distinct offsets for both the first
 7423     // and second coefficient sequences, using an ldr with register
 7424     // variant Q then combine them with next set of 32 zetas. Likewise
 7425     // we store the results using an str with register variant Q.
 7426     for (int i = 0; i < 1024; i += 256) {
 7427       // c0 load 32 (8x4S) coefficients via first offsets
 7428       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7429       // c1 load 32 (8x4S) coefficients via second offsets
 7430       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 7431       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 7432       vs_addv(vs3, __ T4S, vs1, vs2);
 7433       // c = c0 - c1
 7434       vs_subv(vs1, __ T4S, vs1, vs2);
 7435       // store a0 32 (8x4S) coefficients via first offsets
 7436       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 7437       // b load 32 (8x4S) next inputs
 7438       vs_ldpq_post(vs2, zetas);
 7439       // reload constants q, qinv -- they were clobbered earlier
 7440       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7441       // compute a1 = b montmul c
 7442       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7443       // store a1 32 (8x4S) coefficients via second offsets
 7444       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 7445     }
 7446 
 7447     // level 3-7
 7448     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 7449 
 7450     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7451     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 7452     __ ret(lr);
 7453 
 7454     // record the stub entry and end
 7455     store_archive_data(stub_id, start, __ pc());
 7456 
 7457     return start;
 7458   }
 7459 
 7460   // Dilithium multiply polynomials in the NTT domain.
 7461   // Straightforward implementation of the method
 7462   // static int implDilithiumNttMult(
 7463   //              int[] product, int[] coeffs1, int[] coeffs2) {}
 7464   // of the sun.security.provider.ML_DSA class.
 7465   //
 7466   // result (int[256]) = c_rarg0
 7467   // poly1 (int[256]) = c_rarg1
 7468   // poly2 (int[256]) = c_rarg2
 7469   address generate_dilithiumNttMult() {
 7470     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 7471     int entry_count = StubInfo::entry_count(stub_id);
 7472     assert(entry_count == 1, "sanity check");
 7473     address start = load_archive_data(stub_id);
 7474     if (start != nullptr) {
 7475       return start;
 7476     }
 7477     __ align(CodeEntryAlignment);
 7478     StubCodeMark mark(this, stub_id);
 7479     start = __ pc();
 7480     __ enter();
 7481 
 7482     Label L_loop;
 7483 
 7484     const Register result = c_rarg0;
 7485     const Register poly1 = c_rarg1;
 7486     const Register poly2 = c_rarg2;
 7487 
 7488     const Register dilithiumConsts = r10;
 7489     const Register len = r11;
 7490 
 7491     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7492     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7493     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7494     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 7495 
 7496     __ lea(dilithiumConsts,
 7497              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7498 
 7499     // load constants q, qinv
 7500     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7501     // load constant rSquare into v29
 7502     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 7503 
 7504     __ mov(len, zr);
 7505     __ add(len, len, 1024);
 7506 
 7507     __ BIND(L_loop);
 7508 
 7509     // b load 32 (8x4S) next inputs from poly1
 7510     vs_ldpq_post(vs1, poly1);
 7511     // c load 32 (8x4S) next inputs from poly2
 7512     vs_ldpq_post(vs2, poly2);
 7513     // compute a = b montmul c
 7514     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7515     // compute a = rsquare montmul a
 7516     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 7517     // save a 32 (8x4S) results
 7518     vs_stpq_post(vs2, result);
 7519 
 7520     __ sub(len, len, 128);
 7521     __ cmp(len, (u1)128);
 7522     __ br(Assembler::GE, L_loop);
 7523 
 7524     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7525     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 7526     __ ret(lr);
 7527 
 7528     // record the stub entry and end
 7529     store_archive_data(stub_id, start, __ pc());
 7530 
 7531     return start;
 7532   }
 7533 
 7534   // Dilithium Montgomery multiply an array by a constant.
 7535   // A straightforward implementation of the method
 7536   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 7537   // of the sun.security.provider.ML_DSA class
 7538   //
 7539   // coeffs (int[256]) = c_rarg0
 7540   // constant (int) = c_rarg1
 7541   address generate_dilithiumMontMulByConstant() {
 7542     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 7543     int entry_count = StubInfo::entry_count(stub_id);
 7544     assert(entry_count == 1, "sanity check");
 7545     address start = load_archive_data(stub_id);
 7546     if (start != nullptr) {
 7547       return start;
 7548     }
 7549     __ align(CodeEntryAlignment);
 7550     StubCodeMark mark(this, stub_id);
 7551     start = __ pc();
 7552     __ enter();
 7553 
 7554     Label L_loop;
 7555 
 7556     const Register coeffs = c_rarg0;
 7557     const Register constant = c_rarg1;
 7558 
 7559     const Register dilithiumConsts = r10;
 7560     const Register result = r11;
 7561     const Register len = r12;
 7562 
 7563     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7564     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7565     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7566     VSeq<8> vconst(29, 0);             // for montmul by constant
 7567 
 7568     // results track inputs
 7569     __ add(result, coeffs, 0);
 7570     __ lea(dilithiumConsts,
 7571              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7572 
 7573     // load constants q, qinv -- they do not get clobbered by first two loops
 7574     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7575     // copy caller supplied constant across vconst
 7576     __ dup(vconst[0], __ T4S, constant);
 7577     __ mov(len, zr);
 7578     __ add(len, len, 1024);
 7579 
 7580     __ BIND(L_loop);
 7581 
 7582     // load next 32 inputs
 7583     vs_ldpq_post(vs2, coeffs);
 7584     // mont mul by constant
 7585     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 7586     // write next 32 results
 7587     vs_stpq_post(vs2, result);
 7588 
 7589     __ sub(len, len, 128);
 7590     __ cmp(len, (u1)128);
 7591     __ br(Assembler::GE, L_loop);
 7592 
 7593     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7594     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 7595     __ ret(lr);
 7596 
 7597     // record the stub entry and end
 7598     store_archive_data(stub_id, start, __ pc());
 7599 
 7600     return start;
 7601   }
 7602 
 7603   // Dilithium decompose poly.
 7604   // Implements the method
 7605   //    static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart,
 7606   //                                          int twoGamma2, int multiplier) {
 7607   // of the sun.security.provider.ML_DSA class
 7608   //
 7609   // input (int[256]) = c_rarg0
 7610   // lowPart (int[256]) = c_rarg1
 7611   // highPart (int[256]) = c_rarg2
 7612   // twoGamma2  (int) = c_rarg3
 7613   // multiplier (int) = c_rarg4
 7614   address generate_dilithiumDecomposePoly() {
 7615     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 7616     int entry_count = StubInfo::entry_count(stub_id);
 7617     assert(entry_count == 1, "sanity check");
 7618     address start = load_archive_data(stub_id);
 7619     if (start != nullptr) {
 7620       return start;
 7621     }
 7622     __ align(CodeEntryAlignment);
 7623     StubCodeMark mark(this, stub_id);
 7624     start = __ pc();
 7625     Label L_loop;
 7626 
 7627     const Register input = c_rarg0;
 7628     const Register lowPart = c_rarg1;
 7629     const Register highPart = c_rarg2;
 7630     const Register twoGamma2 = c_rarg3;
 7631     const Register multiplier = c_rarg4;
 7632 
 7633     const Register len = r9;
 7634     const Register dilithiumConsts = r10;
 7635     const Register tmp = r11;
 7636 
 7637     // 6 independent sets of 4x4s values
 7638     VSeq<4> vs1(0), vs2(4), vs3(8);
 7639     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7640 
 7641     // 7 constants for cross-multiplying
 7642     VSeq<4> one(25, 0);
 7643     VSeq<4> qminus1(26, 0);
 7644     VSeq<4> g2(27, 0);
 7645     VSeq<4> twog2(28, 0);
 7646     VSeq<4> mult(29, 0);
 7647     VSeq<4> q(30, 0);
 7648     VSeq<4> qadd(31, 0);
 7649 
 7650     __ enter();
 7651 
 7652     __ lea(dilithiumConsts,
 7653              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7654 
 7655     // save callee-saved registers
 7656     __ stpd(v8, v9, __ pre(sp, -64));
 7657     __ stpd(v10, v11, Address(sp, 16));
 7658     __ stpd(v12, v13, Address(sp, 32));
 7659     __ stpd(v14, v15, Address(sp, 48));
 7660 
 7661     // populate constant registers
 7662     __ mov(tmp, zr);
 7663     __ add(tmp, tmp, 1);
 7664     __ dup(one[0], __ T4S, tmp); // 1
 7665     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7666     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7667     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7668     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7669     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7670     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7671 
 7672     __ mov(len, zr);
 7673     __ add(len, len, 1024);
 7674 
 7675     __ BIND(L_loop);
 7676 
 7677     // load next 4x4S inputs interleaved: rplus --> vs1
 7678     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7679 
 7680     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7681     vs_addv(vtmp, __ T4S, vs1, qadd);
 7682     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7683     vs_mulv(vtmp, __ T4S, vtmp, q);
 7684     vs_subv(vs1, __ T4S, vs1, vtmp);
 7685 
 7686     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7687     vs_sshr(vtmp, __ T4S, vs1, 31);
 7688     vs_andr(vtmp, vtmp, q);
 7689     vs_addv(vs1, __ T4S, vs1, vtmp);
 7690 
 7691     // quotient --> vs2
 7692     // int quotient = (rplus * multiplier) >> 22;
 7693     vs_mulv(vtmp, __ T4S, vs1, mult);
 7694     vs_sshr(vs2, __ T4S, vtmp, 22);
 7695 
 7696     // r0 --> vs3
 7697     // int r0 = rplus - quotient * twoGamma2;
 7698     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7699     vs_subv(vs3, __ T4S, vs1, vtmp);
 7700 
 7701     // mask --> vs4
 7702     // int mask = (twoGamma2 - r0) >> 22;
 7703     vs_subv(vtmp, __ T4S, twog2, vs3);
 7704     vs_sshr(vs4, __ T4S, vtmp, 22);
 7705 
 7706     // r0 -= (mask & twoGamma2);
 7707     vs_andr(vtmp, vs4, twog2);
 7708     vs_subv(vs3, __ T4S, vs3, vtmp);
 7709 
 7710     // quotient += (mask & 1);
 7711     vs_andr(vtmp, vs4, one);
 7712     vs_addv(vs2, __ T4S, vs2, vtmp);
 7713 
 7714     // mask = (twoGamma2 / 2 - r0) >> 31;
 7715     vs_subv(vtmp, __ T4S, g2, vs3);
 7716     vs_sshr(vs4, __ T4S, vtmp, 31);
 7717 
 7718     // r0 -= (mask & twoGamma2);
 7719     vs_andr(vtmp, vs4, twog2);
 7720     vs_subv(vs3, __ T4S, vs3, vtmp);
 7721 
 7722     // quotient += (mask & 1);
 7723     vs_andr(vtmp, vs4, one);
 7724     vs_addv(vs2, __ T4S, vs2, vtmp);
 7725 
 7726     // r1 --> vs5
 7727     // int r1 = rplus - r0 - (dilithium_q - 1);
 7728     vs_subv(vtmp, __ T4S, vs1, vs3);
 7729     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7730 
 7731     // r1 --> vs1 (overwriting rplus)
 7732     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7733     vs_negr(vtmp, __ T4S, vs5);
 7734     vs_orr(vtmp, vs5, vtmp);
 7735     vs_sshr(vs1, __ T4S, vtmp, 31);
 7736 
 7737     // r0 += ~r1;
 7738     vs_notr(vtmp, vs1);
 7739     vs_addv(vs3, __ T4S, vs3, vtmp);
 7740 
 7741     // r1 = r1 & quotient;
 7742     vs_andr(vs1, vs2, vs1);
 7743 
 7744     // store results interleaved
 7745     // lowPart[m] = r0;
 7746     // highPart[m] = r1;
 7747     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7748     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7749 
 7750     __ sub(len, len, 64);
 7751     __ cmp(len, (u1)64);
 7752     __ br(Assembler::GE, L_loop);
 7753 
 7754     // restore callee-saved vector registers
 7755     __ ldpd(v14, v15, Address(sp, 48));
 7756     __ ldpd(v12, v13, Address(sp, 32));
 7757     __ ldpd(v10, v11, Address(sp, 16));
 7758     __ ldpd(v8, v9, __ post(sp, 64));
 7759 
 7760     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7761     __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
 7762     __ ret(lr);
 7763 
 7764     // record the stub entry and end
 7765     store_archive_data(stub_id, start, __ pc());
 7766 
 7767     return start;
 7768   }
 7769 
 7770   static constexpr int montMulP256Shift1 = 12; // 64 - bits per limb
 7771   static constexpr int montMulP256Shift2 = 52; // bits per limb
 7772   // stack space needed for carry computation
 7773   static constexpr int cDataSize = 6 * BytesPerLong;
 7774   // stack space needed for data computed by the neon side
 7775   static constexpr int mulDataSize = 16 * BytesPerLong;
 7776 
 7777 
 7778   // Subroutine used by the 52 x 52 bit multiplication algorithm in
 7779   // generate_intpoly_montgomeryMult_P256().
 7780   // This function computes partial results of eight 52 x 52 bit multiplications,
 7781   // where the multiplicands are stored as 64-bit values, specifically
 7782   // (b_0, b_1, b_2, b_3) * (a_3, a_4). (The 4 calls to this function
 7783   // together provide the results of these limb-multiplications.)
 7784   // Calls to this function accept either the low 32 bits or high 20 bits
 7785   // of each b_i packed into bs in ascending order. a_3 and a_4 are packed
 7786   // into successive 64 bit elements of as. lane selects the low 32 or high
 7787   // 20 bits of each a_j value. So four calls with the appropriate parameters
 7788   // will produce the 64-bit low32 * low32, low32 * high20, high20 * low32,
 7789   // high20 * high20 values in the output register sequences vs. The
 7790   // 64-bit partial products are returned in vs in ascending order:
 7791   // vs[0] = (b_0*a_3, b_1*a_3) . . .  vs[3] = (b_2*a_4, b_3*a_4)
 7792 
 7793   void neon_partial_mult_64(const VSeq<4>& vs, FloatRegister bs, FloatRegister as, int lane_lo) {
 7794     __ umullv(vs[0], __ T2D, bs, __ T2S, as, __ S, lane_lo);
 7795     __ umull2v(vs[1], __ T2D, bs, __ T4S, as, __ S, lane_lo);
 7796     __ umullv(vs[2], __ T2D, bs, __ T2S, as, __ S, lane_lo + 2);
 7797     __ umull2v(vs[3], __ T2D, bs, __ T4S, as, __ S, lane_lo + 2);
 7798   }
 7799 
 7800     // Subroutine used by the generate_intpoly_montgomeryMult_P256() function
 7801     // to compute the result of a 52 x 52 bit multiplications where the
 7802     // multiplicands, a and b are available as 64-bit values.
 7803     // The result is going to two 64-bit registers lo (least significant 52 bits)
 7804     // and hi (most significant 52 bits).
 7805     void gpr_partial_mult_52(Register a, Register b, Register hi, Register lo,
 7806      Register mask) {
 7807       // compute 104-bit (40 + 64) full product
 7808       __ umulh(hi, a, b);
 7809       __ mul(lo, a, b);
 7810       // combine 40 + 12 bits into hi result
 7811       // on certain implementations of aarch64 (e.g. apple M1) replacing extr()
 7812       // with the following equivalent instruction sequence the performance
 7813       // improves slightly (despite it is two instructions longer and needs
 7814       // an additional register)
 7815       //      __ lsl(hi, hi, montMulP256Shift1);
 7816       //      __ lsr(tmp, lo, montMulP256Shift2);
 7817       //      __ orr(hi, hi, tmp);
 7818       __ extr(hi, hi, lo, montMulP256Shift2);
 7819       // mask off 52 bits of lo result
 7820       __ andr(lo, lo, mask);
 7821     }
 7822 
 7823   // This assembly follows the Java code in MontgomeryIntegerPolynomial256.mult()
 7824   // quite closely. The main difference is that the computations done with the
 7825   // last two limbs of `a` are done using Neon registers. This allows us to take
 7826   // advantage of both the Neon registers and GPRs simultaneously.
 7827   // It is also worth noting that since Neon does not support 64 bit
 7828   // multiplication, we split each 64 bit value into lower and upper halves
 7829   // and use the "schoolbook" multiplication algorithm.
 7830   address generate_intpoly_montgomeryMult_P256() {
 7831     assert(UseIntPolyIntrinsics, "what are we doing here?");
 7832     StubId stub_id = StubId::stubgen_intpoly_montgomeryMult_P256_id;
 7833     int entry_count = StubInfo::entry_count(stub_id);
 7834     assert(entry_count == 1, "sanity check");
 7835     address start = load_archive_data(stub_id);
 7836     if (start != nullptr) {
 7837       return start;
 7838     }
 7839     __ align(CodeEntryAlignment);
 7840     StubCodeMark mark(this, stub_id);
 7841     start = __ pc();
 7842     __ enter();
 7843 
 7844     // Registers that are used throughout entire routine
 7845     const Register a = c_rarg0;
 7846     const Register b = c_rarg1;
 7847     const Register result = c_rarg2;
 7848 
 7849     RegSet regs = RegSet::range(r0, r28) - rscratch1 - rscratch2
 7850       - r16 - r17 - r18_tls - a - b - result;
 7851 
 7852     auto common_regs = regs.begin();
 7853     Register limb_mask = *common_regs++,
 7854       c_ptr = *common_regs++,
 7855       mod_0 = *common_regs++,
 7856       mod_1 = *common_regs++,
 7857       mod_3 = *common_regs++,
 7858       mod_4 = *common_regs++,
 7859       b_0 = *common_regs++,
 7860       b_1 = *common_regs++,
 7861       b_2 = *common_regs++,
 7862       b_3 = *common_regs++,
 7863       b_4 = *common_regs++;
 7864 
 7865     FloatRegSet floatRegs = FloatRegSet::range(v0, v31)
 7866       - FloatRegSet::range(v8, v15)   // Caller saved vectors
 7867       - FloatRegSet::range(v16, v31); // Manually-allocated vectors
 7868 
 7869     auto common_vectors = floatRegs.begin();
 7870     FloatRegister limb_mask_vec = *common_vectors++,
 7871       b_lows = *common_vectors++,
 7872       b_highs = *common_vectors++,
 7873       a_vals = *common_vectors++;
 7874 
 7875     // Push callee saved registers on to the stack
 7876     RegSet callee_saved = RegSet::range(r19, r28);
 7877     __ push(callee_saved, sp);
 7878 
 7879     // Allocate space on the stack for carry values
 7880     __ sub(sp, sp, cDataSize);
 7881     __ mov(c_ptr, sp);
 7882 
 7883     // Calculate (52-bit) limb masks for both gpr and vector registers
 7884     __ mov(limb_mask, -UCONST64(1) >> montMulP256Shift1);
 7885     __ dup(limb_mask_vec, __ T2D, limb_mask);
 7886 
 7887     //Load input arrays and modulus
 7888     Register a_ptr = *common_regs++, mod_ptr = *common_regs++;
 7889      // skip 3 limbs so a_ptr addresses trailing pair {a3, a4}
 7890     __ add(a_ptr, a, 3 * BytesPerLong);
 7891     __ lea(mod_ptr, ExternalAddress((address)_modulus_P256));
 7892     __ ldr(b_0, Address(b));
 7893     __ ldr(b_1, Address(b, BytesPerLong));
 7894     __ ldr(b_2, Address(b, 2 * BytesPerLong));
 7895     __ ldr(b_3, Address(b, 3 * BytesPerLong));
 7896     __ ldr(b_4, Address(b, 4 * BytesPerLong));
 7897     __ ldr(mod_0, __ post(mod_ptr, BytesPerLong));
 7898     __ ldr(mod_1, __ post(mod_ptr, BytesPerLong));
 7899     __ ldr(mod_3, __ post(mod_ptr, BytesPerLong));
 7900     __ ldr(mod_4, mod_ptr);
 7901     __ ld1(a_vals, __ T2D, a_ptr);
 7902     // use an interleaved load to group low 32 bits and high 20 bits
 7903     // of 4 successive b values into two vector registers
 7904     // n.b. these are the same inputs as the ones in b_0 ... b4
 7905     __ ld2(b_lows, b_highs, __ T4S, b);
 7906     common_regs = common_regs.remaining()
 7907       + a_ptr + mod_ptr;
 7908         a_ptr = mod_ptr = noreg;
 7909 
 7910     //Regs used throughout the main "loop", which is partially unrolled here
 7911     Register high = *common_regs++,
 7912       low = *common_regs++,
 7913       mul_ptr = *common_regs++,
 7914       mod_high = *common_regs++,
 7915       mod_low = *common_regs++,
 7916       a_i = *common_regs++,
 7917       c_i = *common_regs++,
 7918       tmp = *common_regs++,
 7919       n = *common_regs++;
 7920 
 7921     // vector sequences used to compute and combine partial products of
 7922     // b_i * a_j for i = {0,1,2,3} j = {3,4}
 7923     VSeq<4> A(16);
 7924     VSeq<4> B(20);
 7925     VSeq<4> C(24);
 7926     VSeq<4> D(28);
 7927 
 7928 
 7929     // neon and gpr computations are interleaved to maximize parallelism
 7930 
 7931     // allocate stack space for the neon results
 7932     __ sub(sp, sp, mulDataSize);
 7933     __ mov(mul_ptr, sp);
 7934 
 7935     // cross-multiply low * low for limbs b0-b3 and a3-a4 in parallel
 7936     neon_partial_mult_64(A, b_lows, a_vals, 0);
 7937 
 7938     // Limb 0
 7939     __ ldr(a_i, __ post(a, BytesPerLong));
 7940     gpr_partial_mult_52(a_i, b_0, high, low, limb_mask);
 7941     __ mov(n, low);
 7942    // __ andr(n, low, limb_mask);
 7943 
 7944     // cross-multiply high * low for limbs b0-b3 and a3-a4 in parallel
 7945     neon_partial_mult_64(B, b_highs, a_vals, 0);
 7946 
 7947     // Limb 0 modulus computation
 7948     // n.b. modulus computation requires multiplying successive
 7949     // limbs of the product by corresponding limbs of the p256
 7950     // prime adding the result to the limb and folding this
 7951     // partial result into a running 256-bit sum in c_i. Limbs
 7952     // of c_i are stored via c_ptr once carries are included.
 7953     // n.b. the mul + add is omitted for limb 2 since the
 7954     // corresponding prime bits are zero.
 7955     gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
 7956     __ add(low, low, mod_low);
 7957     __ add(high, high, mod_high);
 7958     __ lsr(c_i, low, montMulP256Shift2);
 7959     __ add(c_i, c_i, high);
 7960 
 7961     // cross-multiply low * high for limbs b0-b3 and a3-a4 in parallel
 7962     neon_partial_mult_64(C, b_lows, a_vals, 1);
 7963 
 7964     // Limb 1
 7965     gpr_partial_mult_52(a_i, b_1, high, low, limb_mask);
 7966 
 7967     // cross-multiply high * high for limbs b0-b3 and a3-a4 in parallel
 7968     neon_partial_mult_64(D, b_highs, a_vals, 1);
 7969 
 7970     gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
 7971     __ add(low, low, mod_low);
 7972     __ add(high, high, mod_high);
 7973     __ add(c_i, c_i, low);
 7974     __ str(c_i, c_ptr);
 7975     __ mov(c_i, high);
 7976 
 7977     // combine neon 32-bit partial products, regrouping to produce
 7978     // 8*52-bit low products in A and 8*52-bit high products in D
 7979 
 7980     // add low*high/high*low intermediate products before regrouping
 7981     vs_addv(B, __ T2D, B, C); // Store (B+C) in B
 7982 
 7983     // Limb 2
 7984     gpr_partial_mult_52(a_i, b_2, high, low, limb_mask);
 7985     __ add(c_i, c_i, low);
 7986     __ str(c_i, Address(c_ptr, 8));
 7987     __ mov(c_i, high);
 7988 
 7989     // shift high*high (40-bit) product up into 52-bits of output
 7990     vs_shl(D, __ T2D, D, montMulP256Shift1);
 7991 
 7992     // Limb 3
 7993     gpr_partial_mult_52(a_i, b_3, high, low, limb_mask);
 7994 
 7995     // shift high 32 (or 33) bits of intermediate products for addition to D
 7996     vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) >>> 20)
 7997 
 7998     gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
 7999     __ add(low, low, mod_low);
 8000     __ add(high, high, mod_high);
 8001     __ add(c_i, c_i, low);
 8002     __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
 8003     __ mov(c_i, high);
 8004 
 8005     // shift low 32 bits of intermediate product up for masking and addition to A
 8006     vs_shl(B, __ T2D, B, 32);
 8007 
 8008     // Limb 4
 8009     gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
 8010 
 8011     // add high bits of intermediate product into D
 8012     vs_addv(D, __ T2D, D, C);
 8013 
 8014     gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
 8015     __ add(low, low, mod_low);
 8016     __ add(high, high, mod_high);
 8017     __ add(c_i, c_i, low);
 8018     __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
 8019     __ str(high, Address(c_ptr, 4 * BytesPerLong));
 8020 
 8021     // top 12 bits of 32*32 bit product in A need adding into high 52-bit output
 8022     vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)
 8023     // Only 20 of the 32 bits now in the top of B should be added into A
 8024     vs_andr(B, B, limb_mask_vec);
 8025     // reduce original 64-bit product to 52-bits
 8026     vs_andr(A, A, limb_mask_vec);
 8027     // add intermediate products to high 52-bit result in D
 8028     vs_addv(D, __ T2D, D, C);
 8029     // add 20/21 bits of intermediate product in top of B into low 52-bit result
 8030     vs_addv(A, __ T2D, A, B);
 8031     // save and then mask off any overflow bit from computing low 52-bit result
 8032     vs_ushr(B, __ T2D, A, montMulP256Shift2);
 8033     vs_andr(A, A, limb_mask_vec);
 8034     // add any remaining carry into the high 52-bit result
 8035     vs_addv(D, __ T2D, D, B);
 8036 
 8037     // the write interleaves the 4 successive pairs of low and
 8038     // high results: (l0, l1), (h0, h1), ... (l6, l7), (h6, h7)
 8039     vs_st1_interleaved(A, D, mul_ptr);
 8040 
 8041     // Free mul_ptr
 8042     common_regs = common_regs.remaining() + mul_ptr;
 8043     mul_ptr = noreg;
 8044 
 8045     /////////////////////////
 8046     // Loop 2 & 3
 8047     /////////////////////////
 8048 
 8049     for (int i = 0; i < 2; i++) {
 8050       // Load a_i and increment by 8 bytes
 8051       __ ldr(a_i, __ post(a, BytesPerLong));
 8052       __ ldr(c_i, c_ptr); //Load prior c_i
 8053 
 8054       // Limb 0
 8055       gpr_partial_mult_52(a_i, b_0, high, low, limb_mask);
 8056       __ add(low, low, c_i);
 8057       __ ldr(c_i, Address(c_ptr, BytesPerLong));
 8058       __ andr(n, low, limb_mask);
 8059       gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
 8060       __ add(low, low, mod_low);
 8061       __ add(high, high, mod_high);
 8062       __ lsr(tmp, low, montMulP256Shift2);
 8063       __ add(c_i, c_i, tmp);
 8064       __ add(c_i, c_i, high);
 8065 
 8066       // Limb 1
 8067       gpr_partial_mult_52(a_i, b_1, high, low, limb_mask);
 8068       gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
 8069       __ ldr(tmp, Address(c_ptr, 2 * BytesPerLong));
 8070       __ add(low, low, mod_low);
 8071       __ add(high, high, mod_high);
 8072       __ add(c_i, c_i, low);
 8073       __ str(c_i, c_ptr);
 8074       __ add(c_i, tmp, high);
 8075 
 8076       // Limb 2
 8077       gpr_partial_mult_52(a_i, b_2, high, low, limb_mask);
 8078       __ ldr(tmp, Address(c_ptr, 3 * BytesPerLong));
 8079       __ add(c_i, c_i, low);
 8080       __ str(c_i, Address(c_ptr, BytesPerLong));
 8081       __ add(c_i, tmp, high);
 8082 
 8083       // Limb 3
 8084       gpr_partial_mult_52(a_i, b_3, high, low, limb_mask);
 8085       gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
 8086       __ ldr(tmp, Address(c_ptr, 4 * BytesPerLong));
 8087       __ add(low, low, mod_low);
 8088       __ add(high, high, mod_high);
 8089       __ add(c_i, c_i, low);
 8090       __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
 8091       __ add(c_i, tmp, high);
 8092 
 8093       // Limb 4
 8094       gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
 8095       gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
 8096       __ add(low, low, mod_low);
 8097       __ add(high, high, mod_high);
 8098       __ add(c_i, c_i, low);
 8099       __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
 8100       __ str(high, Address(c_ptr, 4 * BytesPerLong));
 8101     }
 8102     // Reallocate regs b_0, b_1, b_2 and b_3
 8103         common_regs = common_regs.remaining()
 8104           + b_0 + b_1 + b_2 + b_3;
 8105             b_0 = b_1 = b_2 = b_3 = noreg;
 8106 
 8107     Register low_1 = *common_regs++;
 8108     Register high_1 = *common_regs++;
 8109 
 8110     //////////////////////////////
 8111     // a[3]
 8112     //////////////////////////////
 8113 
 8114     // For a_3 and a_4 we have already computed the cross-products
 8115     // with b_0 ... b_3 and stored them on the stack relative to
 8116     // `mul_ptr` i.e. the current `sp` in the order
 8117     // l(a_3 * b_0), l(a_3 * b_1), h(a_3 * b_0), h(a_3 * b_1),
 8118     // l(a_3 * b_2), l(a_3 * b_3), h(a_3 * b_2), h(a_3 * b_3),
 8119     // l(a_4 * b_0), l(a_4 * b_1), h(a_4 * b_0), h(a_4 * b_1),
 8120     // l(a_4 * b_2), l(a_4 * b_3), h(a_4 * b_2), h(a_4 * b_3),
 8121     // where l(x) is the low 52 bits of x and h(x) is the high 52 bits
 8122 
 8123     __ ldr(low_1, Address(sp));
 8124     __ ldr(high_1, Address(sp, 2 * BytesPerLong));
 8125 
 8126     __ ldr(low, Address(sp, BytesPerLong));
 8127     __ ldr(high, Address(sp, 3 * BytesPerLong));
 8128     __ ldr(a_i, __ post(a, BytesPerLong));
 8129     __ ldr(c_i, c_ptr);
 8130 
 8131     // Limb 0
 8132     __ add(low_1, low_1, c_i);
 8133     __ ldr(c_i, Address(c_ptr, BytesPerLong));
 8134     __ andr(n, low_1, limb_mask);
 8135     gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
 8136     __ add(low_1, low_1, mod_low);
 8137     __ add(high_1, high_1, mod_high);
 8138     __ lsr(tmp, low_1, montMulP256Shift2);
 8139     __ add(c_i, c_i, tmp);
 8140     __ add(c_i, c_i, high_1);
 8141 
 8142     // Limb 1
 8143     __ ldr(low_1, Address(sp, 4 * BytesPerLong));
 8144     __ ldr(high_1, Address(sp, 6 * BytesPerLong));
 8145     gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
 8146     __ ldr(tmp, Address(c_ptr, 2 * BytesPerLong));
 8147     __ andr(mod_low, mod_low, limb_mask);
 8148     __ add(low, low, mod_low);
 8149     __ add(high, high, mod_high);
 8150     __ add(c_i, c_i, low);
 8151     __ str(c_i, c_ptr);
 8152     __ add(c_i, tmp, high);
 8153 
 8154     // Limb 2
 8155     __ ldr(low, Address(sp, 5 * BytesPerLong));
 8156     __ ldr(high, Address(sp, 7 * BytesPerLong));
 8157     __ ldr(tmp, Address(c_ptr, 3 * BytesPerLong));
 8158     __ add(c_i, c_i, low_1);
 8159     __ str(c_i, Address(c_ptr, BytesPerLong));
 8160     __ add(c_i, tmp, high_1);
 8161 
 8162     // Limb 3
 8163     gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
 8164     __ ldr(tmp, Address(c_ptr, 4 * BytesPerLong));
 8165     __ add(low, low, mod_low);
 8166     __ add(high, high, mod_high);
 8167     __ add(c_i, c_i, low);
 8168     __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
 8169     __ add(c_i, tmp, high);
 8170 
 8171     // Limb 4
 8172     __ ldr(low, Address(sp, 8 * BytesPerLong));
 8173     __ ldr(high, Address(sp, 10 * BytesPerLong));
 8174     gpr_partial_mult_52(a_i, b_4, high_1, low_1, limb_mask);
 8175     gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
 8176     __ add(low_1, low_1, mod_low);
 8177     __ add(high_1, high_1, mod_high);
 8178     __ add(c_i, c_i, low_1);
 8179     __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
 8180     __ str(high_1, Address(c_ptr, 4 * BytesPerLong));
 8181 
 8182     //////////////////////////////
 8183     // a[4]
 8184     //////////////////////////////
 8185 
 8186     Register c5 = *common_regs++,
 8187       c6 = *common_regs++,
 8188       c7 = *common_regs++;
 8189 
 8190     __ ldr(a_i, a);
 8191     __ ldr(c_i, c_ptr);
 8192 
 8193     // Limb 0
 8194     __ ldr(low_1, Address(sp, 9 * BytesPerLong));
 8195     __ ldr(high_1, Address(sp, 11 * BytesPerLong));
 8196 
 8197     __ add(low, low, c_i);
 8198     __ ldr(c_i, Address(c_ptr, BytesPerLong));
 8199     __ andr(n, low, limb_mask);
 8200     gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
 8201     __ add(low, low, mod_low);
 8202     __ add(high, high, mod_high);
 8203     __ lsr(tmp, low, montMulP256Shift2);
 8204     __ add(c_i, c_i, tmp);
 8205     __ add(c_i, c_i, high);
 8206
          // Limb 1
 8207     __ ldr(low, Address(sp, 12 * BytesPerLong));
 8208     __ ldr(high, Address(sp, 14 * BytesPerLong));
 8209     gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
 8210     __ add(low_1, low_1, mod_low);
 8211     __ add(high_1, high_1, mod_high);
 8212     __ add(c5, c_i, low_1);
 8213     __ ldr(c_i, Address(c_ptr, 2 * BytesPerLong));
 8214     __ lsr(tmp, c5, montMulP256Shift2);
 8215     __ add(c_i, c_i, tmp);
 8216     __ add(c_i, c_i, high_1);
 8217 
 8218     // Limb 2
 8219     __ ldr(low_1, Address(sp, 13 * BytesPerLong));
 8220     __ ldr(high_1, Address(sp, 15 * BytesPerLong));
 8221     __ add(c6, c_i, low);
 8222     __ ldr(c_i, Address(c_ptr, 3 * BytesPerLong));
 8223     __ lsr(tmp, c6, montMulP256Shift2);
 8224     __ add(c_i, c_i, tmp);
 8225     __ add(c_i, c_i, high);
 8226 
 8227     // Limb 3
 8228     gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
 8229     __ add(low_1, low_1, mod_low);
 8230     __ add(high_1, high_1, mod_high);
 8231     __ add(c7, c_i, low_1);
 8232     __ ldr(c_i, Address(c_ptr, 4 * BytesPerLong));
 8233     __ lsr(tmp, c7, montMulP256Shift2);
 8234     __ add(c_i, c_i, tmp);
 8235     __ add(c_i, c_i, high_1);
 8236 
 8237     // Limb 4
 8238     gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
 8239     gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
 8240     __ add(low, low, mod_low);
 8241     __ add(high, high, mod_high);
 8242 
 8243     // Reallocate b_4
 8244     common_regs = common_regs.remaining() + b_4;
 8245     b_4 = noreg;
 8246 
 8247     Register c8 = *common_regs++,
 8248       c9 = *common_regs++;
 8249 
 8250     __ add(c8, c_i, low);
 8251     __ lsr(c9, c8, montMulP256Shift2);
 8252     __ add(c9, c9, high);
 8253 
 8254     __ andr(c5, c5, limb_mask);
 8255     __ andr(c6, c6, limb_mask);
 8256     __ andr(c7, c7, limb_mask);
 8257     __ andr(c8, c8, limb_mask);
 8258 
 8259     /////////////////////////////
 8260     // Final carry propagate
 8261     /////////////////////////////
 8262 
 8263     // c0 = c5 - modulus[0];
 8264     // c1 = c6 - modulus[1] + (c0 >> BITS_PER_LIMB);
 8265     // c0 &= LIMB_MASK;
 8266     // c2 = c7 + (c1 >> BITS_PER_LIMB);
 8267     // c1 &= LIMB_MASK;
 8268     // c3 = c8 - modulus[3] + (c2 >> BITS_PER_LIMB);
 8269     // c2 &= LIMB_MASK;
 8270     // c4 = c9 - modulus[4] + (c3 >> BITS_PER_LIMB);
 8271     // c3 &= LIMB_MASK;
 8272 
 8273     // Free up all unused regs
 8274     common_regs = common_regs.remaining()
 8275       + c_ptr + low + high + mod_high
 8276       + mod_low + a_i + c_i + n + low_1 + high_1;
 8277         c_ptr = low = high = mod_high
 8278       = mod_low = a_i = c_i = n = low_1 = high_1 = noreg;
 8279 
 8280     Register c0 = *common_regs++,
 8281       c1 = *common_regs++,
 8282       c2 = *common_regs++,
 8283       c3 = *common_regs++,
 8284       c4 = *common_regs++;
 8285 
 8286     __ sub(c0, c5, mod_0);
 8287     __ sub(c1, c6, mod_1);
 8288     __ sub(c3, c8, mod_3);
 8289     __ sub(c4, c9, mod_4);
 8290     __ add(c1, c1, c0, Assembler::ASR, montMulP256Shift2);
 8291     __ andr(c0, c0, limb_mask);
 8292     __ add(c2, c7, c1, Assembler::ASR, montMulP256Shift2);
 8293     __ andr(c1, c1, limb_mask);
 8294     __ add(c3, c3, c2, Assembler::ASR, montMulP256Shift2);
 8295     __ andr(c2, c2, limb_mask);
 8296     __ add(c4, c4, c3, Assembler::ASR, montMulP256Shift2);
 8297     __ andr(c3, c3, limb_mask);
 8298 
 8299     // Final write back
 8300     // mask = c4 >> 63
 8301     // r[0] = ((c5 & mask) | (c0 & ~mask));
 8302     // r[1] = ((c6 & mask) | (c1 & ~mask));
 8303     // r[2] = ((c7 & mask) | (c2 & ~mask));
 8304     // r[3] = ((c8 & mask) | (c3 & ~mask));
 8305     // r[4] = ((c9 & mask) | (c4 & ~mask));
 8306 
 8307     common_regs = common_regs.remaining()
 8308       + mod_0 + mod_1 + mod_3 + mod_4;
 8309         mod_0 = mod_1 = mod_3 = mod_4 = noreg;
 8310 
 8311     Register mask = *common_regs++;
 8312     Register nmask = *common_regs++;
 8313 
 8314     __ asr(mask, c4, 63);
 8315     __ mvn(nmask, mask);
 8316     __ andr(c5, c5, mask);
 8317     __ andr(tmp, c0, nmask);
 8318     __ orr(c5, c5, tmp);
 8319     __ andr(c6, c6, mask);
 8320     __ andr(tmp, c1, nmask);
 8321     __ orr(c6, c6, tmp);
 8322     __ andr(c7, c7, mask);
 8323     __ andr(tmp, c2, nmask);
 8324     __ orr(c7, c7, tmp);
 8325     __ andr(c8, c8, mask);
 8326     __ andr(tmp, c3, nmask);
 8327     __ orr(c8, c8, tmp);
 8328     __ andr(c9, c9, mask);
 8329     __ andr(tmp, c4, nmask);
 8330     __ orr(c9, c9, tmp);
 8331 
 8332     __ str(c5, result);
 8333     __ str(c6, Address(result, BytesPerLong));
 8334     __ str(c7, Address(result, 2 * BytesPerLong));
 8335     __ str(c8, Address(result, 3 * BytesPerLong));
 8336     __ str(c9, Address(result, 4 * BytesPerLong));
 8337 
 8338     // End intrinsic call
 8339     __ add(sp, sp, cDataSize + mulDataSize);
 8340     __ pop(callee_saved, sp);
 8341     __ leave();
 8342     __ mov(r0, zr); // return 0
 8343     __ ret(lr);
 8344 
 8345     // record the stub entry and end
 8346     store_archive_data(stub_id, start, __ pc());
 8347 
 8348     return start;
 8349   }
 8350 
 8351   address generate_intpoly_assign() {
          // Emits a stub that conditionally copies bLimbs into aLimbs without
          // branching on `set`: every limb is rewritten as
          //   a[i] = a[i] ^ (-set & (a[i] ^ b[i]))
          // (see the pseudocode further down). Limb counts 5, 10, 14, 16 and 19
          // get fully unrolled paths; every other count uses a scalar loop.
          //
          // Stub arguments, as bound to registers below:
          //   c_rarg0 - set
          //   c_rarg1 - aLimbs (read and written)
          //   c_rarg2 - bLimbs (read)
          //   c_rarg3 - length (number of limbs)
          // The emitted code returns 0 in r0. This function returns the stub
          // entry address, either freshly generated or the non-null value
          // handed back by load_archive_data().
          //
 8352     // KNOWN Lengths:
 8353     //   MontgomeryIntPolynP256:  5 = 4 + 1
 8354     //   IntegerPolynomial1305:   5 = 4 + 1
 8355     //   IntegerPolynomial25519: 10 = 8 + 2
 8356     //   IntegerPolynomialP256:  10 = 8 + 2
 8357     //   Curve25519OrderField:   10 = 8 + 2
 8358     //   Curve25519OrderField:   10 = 8 + 2
 8359     //   P256OrderField:         10 = 8 + 2
 8360     //   IntegerPolynomialP384:  14 = 8 + 4 + 2
 8361     //   P384OrderField:         14 = 8 + 4 + 2
 8362     //   IntegerPolynomial448:   16 = 8 + 8
 8363     //   Curve448OrderField:     16 = 8 + 8
 8364     //   Curve448OrderField:     16 = 8 + 8
 8365     //   IntegerPolynomialP521:  19 = 8 + 8 + 2 + 1
 8366     //   P521OrderField:         19 = 8 + 8 + 2 + 1
 8367     // Special Cases 5, 10, 14, 16, 19
 8368     assert(UseIntPolyIntrinsics, "what are we doing here?");
 8369     StubId stub_id = StubId::stubgen_intpoly_assign_id;
 8370     int entry_count = StubInfo::entry_count(stub_id);
 8371     assert(entry_count == 1, "sanity check");
          // Reuse a previously archived copy of this stub when one is available.
 8372     address start = load_archive_data(stub_id);
 8373     if (start != nullptr) {
 8374       return start;
 8375     }
 8376
 8377     __ align(CodeEntryAlignment);
 8378     StubCodeMark mark(this, stub_id);
 8379     start = __ pc();
 8380     __ enter();
 8381
 8382     // Inputs
 8383     const Register set = c_rarg0;
 8384     const Register aLimbs = c_rarg1;
 8385     const Register bLimbs = c_rarg2;
 8386     const Register length = c_rarg3;
 8387
 8388     Label L_Length5, L_Length10, L_Length14, L_Length16, L_Length19, L_Default, L_Done;
 8389
          // Reference algorithm implemented by every path below:
 8390     /*
 8391     int maskValue = -set;
 8392     for (int i = 0; i < a.length; i++) {
 8393         long dummyLimbs = maskValue & (a[i] ^ b[i]);
 8394         a[i] = dummyLimbs ^ a[i];
 8395     }
 8396     */
 8397     Register mask_scalar = r4;
 8398     FloatRegister mask_vec = v0;
 8399
          // mask_scalar = -set; mask_vec carries the same value in both 64-bit
          // lanes for the NEON paths.
          // NOTE(review): set and length are ints in the pseudocode above, while
          // neg/cmp here look like the full-register forms -- confirm the caller
          // passes both arguments extended to 64 bits.
 8400     __ neg(mask_scalar, set);
 8401     __ dup(mask_vec, __ T2D, mask_scalar);
 8402
          // Dispatch on the limb count; unlisted counts take the generic loop.
 8403     __ cmp(length, (u1)5);
 8404     __ br(Assembler::EQ, L_Length5);
 8405     __ cmp(length, (u1)10);
 8406     __ br(Assembler::EQ, L_Length10);
 8407     __ cmp(length, (u1)14);
 8408     __ br(Assembler::EQ, L_Length14);
 8409     __ cmp(length, (u1)16);
 8410     __ br(Assembler::EQ, L_Length16);
 8411     __ cmp(length, (u1)19);
 8412     __ br(Assembler::EQ, L_Length19);
 8413     __ b(L_Default);
 8414
 8415
 8416     // Length = 5
 8417     // Use 5 GPRs (neon not faster with this few limbs)
 8418     __ BIND(L_Length5);
 8419     {
 8420       Register a0 = r5;
 8421       Register a1 = r6;
 8422       Register a2 = r7;
 8423       Register a3 = r10;
 8424       Register a4 = r11;
 8425       Register b0 = r12;
 8426       Register b1 = r13;
 8427       Register b2 = r14;
 8428       Register b3 = r15;
 8429       Register b4 = r19;
 8430
            // r19 doubles as b4 on this path, so it is pushed here and popped
            // again before branching to L_Done.
 8431       __ push(r19, sp);
 8432
 8433       __ ldr(a0, aLimbs);
 8434       __ ldr(a1, Address(aLimbs, 1 * BytesPerLong));
 8435       __ ldr(a2, Address(aLimbs, 2 * BytesPerLong));
 8436       __ ldr(a3, Address(aLimbs, 3 * BytesPerLong));
 8437       __ ldr(a4, Address(aLimbs, 4 * BytesPerLong));
 8438
 8439       __ ldr(b0, bLimbs);
 8440       __ ldr(b1, Address(bLimbs, 1 * BytesPerLong));
 8441       __ ldr(b2, Address(bLimbs, 2 * BytesPerLong));
 8442       __ ldr(b3, Address(bLimbs, 3 * BytesPerLong));
 8443       __ ldr(b4, Address(bLimbs, 4 * BytesPerLong));
 8444
            // b[i] = a[i] ^ b[i]
 8445       __ eor(b0, b0, a0);
 8446       __ eor(b1, b1, a1);
 8447       __ eor(b2, b2, a2);
 8448       __ eor(b3, b3, a3);
 8449       __ eor(b4, b4, a4);
 8450
            // b[i] &= mask  (this is dummyLimbs in the pseudocode)
 8451       __ andr(b0, b0, mask_scalar);
 8452       __ andr(b1, b1, mask_scalar);
 8453       __ andr(b2, b2, mask_scalar);
 8454       __ andr(b3, b3, mask_scalar);
 8455       __ andr(b4, b4, mask_scalar);
 8456
            // a[i] ^= dummyLimbs
 8457       __ eor(a0, a0, b0);
 8458       __ eor(a1, a1, b1);
 8459       __ eor(a2, a2, b2);
 8460       __ eor(a3, a3, b3);
 8461       __ eor(a4, a4, b4);
 8462
 8463       __ str(a0, aLimbs);
 8464       __ str(a1, Address(aLimbs, 1 * BytesPerLong));
 8465       __ str(a2, Address(aLimbs, 2 * BytesPerLong));
 8466       __ str(a3, Address(aLimbs, 3 * BytesPerLong));
 8467       __ str(a4, Address(aLimbs, 4 * BytesPerLong));
 8468
 8469       __ pop(r19, sp);
 8470       __ b(L_Done);
 8471     }
 8472
 8473     // Length = 10
 8474     // Split into 4 neon regs and 2 GPRs
 8475     __ BIND(L_Length10);
 8476     {
            // Despite their names, a9/a10 and b9/b10 hold limbs 8 and 9 (see the
            // load offsets below).
 8477       Register a9 = r10;
 8478       Register a10 = r11;
 8479       Register b9 = r12;
 8480       Register b10 = r13;
 8481
 8482       VSeq<4> a_vec(16);
 8483       VSeq<4> b_vec(20);
 8484
            // Scalar (limbs 8, 9) and vector (presumably limbs 0..7) work is
            // interleaved from here on; both apply the same eor/and/eor steps.
 8485       __ ldr(a9, Address(aLimbs, 8 * BytesPerLong));
 8486       __ ldr(a10, Address(aLimbs, 9 * BytesPerLong));
 8487       __ ldr(b9, Address(bLimbs, 8 * BytesPerLong));
 8488       __ ldr(b10, Address(bLimbs, 9 * BytesPerLong));
 8489
 8490       vs_ldpq(a_vec, aLimbs);
 8491
 8492       __ eor(b9, b9, a9);
 8493       __ eor(b10, b10, a10);
 8494
 8495       vs_ldpq(b_vec, bLimbs);
 8496
 8497       __ andr(b9, b9, mask_scalar);
 8498       __ andr(b10, b10, mask_scalar);
 8499
 8500       vs_eor(b_vec, b_vec, a_vec);
 8501
 8502       __ eor(a9, a9, b9);
 8503       __ eor(a10, a10, b10);
 8504
 8505       vs_andr(b_vec, b_vec, mask_vec);
 8506
            // The scalar stores use aLimbs as base and are emitted before
            // vs_stpq_post, which (going by its name) post-increments aLimbs --
            // TODO confirm against the helper.
 8507       __ str(a9, Address(aLimbs, 8 * BytesPerLong));
 8508       __ str(a10, Address(aLimbs, 9 * BytesPerLong));
 8509
 8510       vs_eor(a_vec, a_vec, b_vec);
 8511       vs_stpq_post(a_vec, aLimbs);
 8512
 8513       __ b(L_Done);
 8514     }
 8515
 8516     // Length = 14
 8517     // Split into 5 neon regs and 4 GPRs
 8518     __ BIND(L_Length14);
 8519     {
            // GPRs carry limbs 10..13.
 8520       Register a10 = r5;
 8521       Register a11 = r6;
 8522       Register a12 = r7;
 8523       Register a13 = r8;
 8524       Register b10 = r9;
 8525       Register b11 = r10;
 8526       Register b12 = r11;
 8527       Register b13 = r12;
 8528
 8529       VSeq<5> a_vec(16);
 8530       VSeq<5> b_vec(22);
 8531
            // Byte offsets handed to the indexed pair load/store helpers together
            // with the base offset 16; presumably they address limbs 2..9, the
            // part not covered by the single ld1/st1 of vector 0 -- TODO confirm.
 8532       int offsets[2] = { 0, 32 };
 8533
 8534       __ ldr(a10, Address(aLimbs, 10 * BytesPerLong));
 8535       __ ldr(a11, Address(aLimbs, 11 * BytesPerLong));
 8536       __ ldr(a12, Address(aLimbs, 12 * BytesPerLong));
 8537       __ ldr(a13, Address(aLimbs, 13 * BytesPerLong));
 8538
 8539       __ ldr(b10, Address(bLimbs, 10 * BytesPerLong));
 8540       __ ldr(b11, Address(bLimbs, 11 * BytesPerLong));
 8541       __ ldr(b12, Address(bLimbs, 12 * BytesPerLong));
 8542       __ ldr(b13, Address(bLimbs, 13 * BytesPerLong));
 8543
            // Vector 0 is loaded on its own, the remaining vectors in pairs.
 8544       __ ld1(a_vec[0], __ T2D, aLimbs);
 8545       vs_ldpq_indexed(vs_tail(a_vec), aLimbs, 16, offsets);
 8546
 8547       __ eor(b10, b10, a10);
 8548       __ eor(b11, b11, a11);
 8549       __ eor(b12, b12, a12);
 8550       __ eor(b13, b13, a13);
 8551
 8552       __ ld1(b_vec[0], __ T2D, bLimbs);
 8553       vs_ldpq_indexed(vs_tail(b_vec), bLimbs, 16, offsets);
 8554
 8555       __ andr(b10, b10, mask_scalar);
 8556       __ andr(b11, b11, mask_scalar);
 8557       __ andr(b12, b12, mask_scalar);
 8558       __ andr(b13, b13, mask_scalar);
 8559
 8560       vs_eor(b_vec, b_vec, a_vec);
 8561
 8562       __ eor(a10, a10, b10);
 8563       __ eor(a11, a11, b11);
 8564       __ eor(a12, a12, b12);
 8565       __ eor(a13, a13, b13);
 8566
 8567       vs_andr(b_vec, b_vec, mask_vec);
 8568
 8569       __ str(a10, Address(aLimbs, 10 * BytesPerLong));
 8570       __ str(a11, Address(aLimbs, 11 * BytesPerLong));
 8571       __ str(a12, Address(aLimbs, 12 * BytesPerLong));
 8572       __ str(a13, Address(aLimbs, 13 * BytesPerLong));
 8573
 8574       vs_eor(a_vec, a_vec, b_vec);
 8575
 8576       __ st1(a_vec[0], __ T2D, aLimbs);
 8577       vs_stpq_indexed(vs_tail(a_vec), aLimbs, 16, offsets);
 8578
 8579       __ b(L_Done);
 8580     }
 8581
 8582     // Length = 16
 8583     // Use 8 neon regs
 8584     __ BIND(L_Length16);
 8585     {
 8586       VSeq<8> a_vec(16);
 8587       VSeq<8> b_vec(24);
 8588
            // Pure vector path: load, b ^= a, b &= mask, a ^= b, store.
 8589       vs_ldpq(a_vec, aLimbs);
 8590       vs_ldpq(b_vec, bLimbs);
 8591       vs_eor(b_vec, b_vec, a_vec);
 8592       vs_andr(b_vec, b_vec, mask_vec);
 8593       vs_eor(a_vec, a_vec, b_vec);
 8594       vs_stpq_post(a_vec, aLimbs);
 8595
 8596       __ b(L_Done);
 8597     }
 8598
 8599     // Length = 19
 8600     // Split into 8 neon regs and 3 GPRs
 8601     __ BIND(L_Length19);
 8602     {
            // Despite their names, a17..a19 and b17..b19 hold limbs 16..18 (see
            // the load offsets below).
 8603       Register a17 = r10;
 8604       Register a18 = r11;
 8605       Register a19 = r12;
 8606       Register b17 = r13;
 8607       Register b18 = r14;
 8608       Register b19 = r15;
 8609
 8610       VSeq<8> a_vec(16);
 8611       VSeq<8> b_vec(24);
 8612
 8613       __ ldr(a17, Address(aLimbs, 16 * BytesPerLong));
 8614       __ ldr(a18, Address(aLimbs, 17 * BytesPerLong));
 8615       __ ldr(a19, Address(aLimbs, 18 * BytesPerLong));
 8616       __ ldr(b17, Address(bLimbs, 16 * BytesPerLong));
 8617       __ ldr(b18, Address(bLimbs, 17 * BytesPerLong));
 8618       __ ldr(b19, Address(bLimbs, 18 * BytesPerLong));
 8619
 8620       vs_ldpq(a_vec, aLimbs);
 8621
 8622       __ eor(b17, b17, a17);
 8623       __ eor(b18, b18, a18);
 8624       __ eor(b19, b19, a19);
 8625
 8626       vs_ldpq(b_vec, bLimbs);
 8627
 8628       __ andr(b17, b17, mask_scalar);
 8629       __ andr(b18, b18, mask_scalar);
 8630       __ andr(b19, b19, mask_scalar);
 8631
 8632       vs_eor(b_vec, b_vec, a_vec);
 8633
 8634       __ eor(a17, a17, b17);
 8635       __ eor(a18, a18, b18);
 8636       __ eor(a19, a19, b19);
 8637
 8638       vs_andr(b_vec, b_vec, mask_vec);
 8639
            // As in the length-10 path, the scalar stores precede vs_stpq_post.
 8640       __ str(a17, Address(aLimbs, 16 * BytesPerLong));
 8641       __ str(a18, Address(aLimbs, 17 * BytesPerLong));
 8642       __ str(a19, Address(aLimbs, 18 * BytesPerLong));
 8643
 8644       vs_eor(a_vec, a_vec, b_vec);
 8645       vs_stpq_post(a_vec, aLimbs);
 8646
 8647       __ b(L_Done);
 8648     }
 8649
          // Generic fallback: one limb per iteration, counting ctr down to zero.
          // Falls through into L_Done when the loop ends.
 8650     __ BIND(L_Default);
 8651     {
 8652       Register ctr = r5;
 8653       Register a_val = r6;
 8654       Register b_val = r7;
 8655
 8656       __ mov(ctr, length); // length (the number of limbs) is never 0
 8657
 8658       Label default_loop;
 8659       __ BIND(default_loop);
 8660
 8661       __ ldr(a_val, aLimbs);
 8662       __ ldr(b_val, __ post(bLimbs, 8)); // bLimbs advances by one limb
 8663       __ eor(b_val, b_val, a_val);
 8664       __ andr(b_val, b_val, mask_scalar);
 8665       __ eor(a_val, a_val, b_val);
 8666       __ str(a_val, __ post(aLimbs, 8)); // aLimbs advances by one limb
 8667       __ sub(ctr, ctr, 1);
 8668       __ cmp(ctr, (u1)0);
 8669       __ br(Assembler::NE, default_loop);
 8670     }
 8671
 8672     __ BIND(L_Done);
 8673     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8674     __ mov(r0, zr); // return 0
 8675     __ ret(lr);
 8676
 8677     // record the stub entry and end
 8678     store_archive_data(stub_id, start, __ pc());
 8679
 8680     return start;
 8681   }
 8682 
 8683   /**
 8684    * Arithmetic polynomial multiplication in Curve25519.  The algorithm mimics
 8685    * the version in the IntegerPolynomial25519 class, including the use of all
 8686    * columns (no folding method).
 8687    *
 8688    * Arguments:
 8689    *
 8690    * Inputs:
 8691    *   c_rarg0   - long[] aLimbs
 8692    *   c_rarg1   - long[] bLimbs
 8693    *
 8694    * Output:
 8695    *   c_rarg2   - long[] rLimbs result
 8696    */
 8697   address generate_intpoly_mult_25519() {
          // The emitted stub reads 5 limbs from each input, writes 5 limbs to
          // rLimbs and returns 0 in r0. This function returns the stub entry
          // address, freshly generated or taken from load_archive_data().
 8698     StubId stub_id = StubId::stubgen_intpoly_mult_25519_id;
 8699     int entry_count = StubInfo::entry_count(stub_id);
 8700     assert(entry_count == 1, "sanity check");
 8701     address start = load_archive_data(stub_id);
 8702     if (start != nullptr) {
 8703       return start;
 8704     }
 8705     __ align(CodeEntryAlignment);
 8706     StubCodeMark mark(this, stub_id);
 8707     start = __ pc();
 8708     __ enter();
 8709
 8710     // Register Map
 8711     const Register aLimbs  = c_rarg0; // r0
 8712     const Register bLimbs  = c_rarg1; // r1
 8713     const Register rLimbs  = c_rarg2; // r2
 8714
          // c[0..9] are the column accumulators (columns == 2 * limbs); a and b
          // hold the current limb pair, low/high the two halves of its product.
 8715     Register c[]   = {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12};
 8716     Register a     = r13;
 8717     Register b     = r14;
 8718     Register term  = r15;
 8719     Register low   = r16;
 8720     Register high  = r17;
 8721
          // bpl is the number of bits per limb, mask selects the low bpl bits and
          // CARRY_ADD is 1 << (bpl - 1), the rounding term of the carry steps.
          // TERM multiplies the upper five columns when they are folded onto the
          // lower five; presumably the 19 of p = 2^255 - 19 -- TODO confirm
          // against IntegerPolynomial25519.
 8722     const int32_t limbs      = 5;
 8723     const int32_t bpl        = 51;
 8724     const int32_t rem        = 64 - bpl;
 8725     const int32_t TERM       = 19;
 8726     const int32_t columns    = limbs * 2;
 8727     const uint64_t mask      = (uint64_t) -1 >> rem;
 8728     const uint64_t CARRY_ADD = (uint64_t) 1 << (bpl - 1);
 8729
          // Load the fold constant and clear all column accumulators.
 8730     __ mov(term, TERM);
 8731     for (int i = 0; i < columns; i++) {
 8732       __ mov(c[i], zr);
 8733     }
 8734
 8735     // Perform high/low multiplication with signed 5x51 bit limbs
          // For every pair (a[j], b[i]) the 128-bit product is split at bit bpl:
          // the low bpl bits are added to column i + j, the remaining bits
          // (product >> bpl) to column i + j + 1.
 8736     for (int i = 0; i < limbs; i++) {
 8737       __ ldr(b, Address(bLimbs, i * 8));
 8738       for (int j = 0; j < limbs; j++) {
 8739         __ ldr(a, Address(aLimbs, j * 8));
 8740         __ smulh(high, a, b);            // upper 64 bits of the signed product
 8741         __ mul(low, a, b);               // lower 64 bits of the product
 8742         __ extr(high, high, low, bpl);   // high = (high:low) >> bpl
 8743         __ andr(low, low,  mask);        // low  = product & mask
 8744         __ add(c[i + j], c[i + j], low);
 8745         __ add(c[i + j + 1], c[i + j + 1], high);
 8746       }
 8747     }
 8748
          // Fold columns 5..9 onto columns 0..4: c[i] += TERM * c[i + 5].
 8749     for (int i = 0; i < limbs; i++) {
 8750       __ mul(c[i + 5], c[i + 5], term);
 8751       __ add(c[i], c[i], c[i + 5]);
 8752     }
 8753
 8754     // Carry-add with reduction from high limb
          // low and high are no longer needed as product halves and are reused.
          // Each carry step computes t = (c[k] + CARRY_ADD) >> bpl (arithmetic
          // shift), adds t to the next limb and subtracts t << bpl from c[k].
 8755     Register tmp       = low;
 8756     Register carry_add = high;
 8757     __ mov(carry_add, CARRY_ADD);
 8758
 8759     // Limb 3
 8760     __ add(tmp, c[3], carry_add);
 8761     __ asr(tmp, tmp, bpl);
 8762     __ add(c[4], c[4], tmp);
 8763     __ lsl(tmp, tmp, bpl);
 8764     __ sub(c[3], c[3], tmp);
 8765
 8766     // Limb 4
 8767     __ add(tmp, c[4], carry_add);
 8768     __ asr(tmp, tmp, bpl);
 8769
 8770     // Reduce high order limb and fold back into low order limb
          // The carry out of limb 4 wraps around to limb 0 multiplied by TERM.
          // term is overwritten here and no longer holds TERM afterwards.
 8771     __ mul(term, tmp, term);
 8772     __ add(c[0], c[0], term);
 8773
 8774     __ lsl(tmp, tmp, bpl);
 8775     __ sub(c[4], c[4], tmp);
 8776
 8777     // Limbs 0 - 3
 8778     for (int i = 0; i < (limbs - 1); i++) {
 8779       __ add(tmp, c[i], carry_add);
 8780       __ asr(tmp, tmp, bpl);
 8781       __ add(c[i + 1], c[i + 1], tmp);
 8782       __ lsl(tmp, tmp, bpl);
 8783       __ sub(c[i], c[i], tmp);
 8784     }
 8785
          // Write the five reduced limbs to the result array.
 8786     for (int i = 0; i < limbs; i++) {
 8787       __ str(c[i], Address(rLimbs, i * 8));
 8788     }
 8789
 8790     __ mov(r0, 0); // return 0
 8791     __ leave();   // required for proper stackwalking of RuntimeStub frame
 8792     __ ret(lr);
 8793
 8794     // record the stub entry and end
 8795     store_archive_data(stub_id, start, __ pc());
 8796
 8797     return start;
 8798   }
 8799 
 8800   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 8801              Register tmp0, Register tmp1, Register tmp2) {
          // Emits the update a[i] ^= a[i + 2] & ~a[i + 1] (indices mod 5) for the
          // five registers a0..a4, using tmp0..tmp2 as scratch. All five results
          // are computed from the values the registers held on entry: the terms
          // for a0 and a1 are parked in tmp0/tmp1 and applied last, because the
          // old a0 and a1 are still needed for the a3 and a4 terms.
          // keccak_round_gpr applies this to each group of five state registers.
 8802     __ bic(tmp0, a2, a1); // for a0
 8803     __ bic(tmp1, a3, a2); // for a1
 8804     __ bic(tmp2, a4, a3); // for a2
 8805     __ eor(a2, a2, tmp2);
 8806     __ bic(tmp2, a0, a4); // for a3
 8807     __ eor(a3, a3, tmp2);
 8808     __ bic(tmp2, a1, a0); // for a4
 8809     __ eor(a0, a0, tmp0);
 8810     __ eor(a1, a1, tmp1);
 8811     __ eor(a4, a4, tmp2);
 8812   }
 8813 
 8814   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 8815                         Register a0, Register a1, Register a2, Register a3, Register a4,
 8816                         Register a5, Register a6, Register a7, Register a8, Register a9,
 8817                         Register a10, Register a11, Register a12, Register a13, Register a14,
 8818                         Register a15, Register a16, Register a17, Register a18, Register a19,
 8819                         Register a20, Register a21, Register a22, Register a23, Register a24,
 8820                         Register tmp0, Register tmp1, Register tmp2) {
          // Emits the code for one Keccak round on a state held entirely in
          // general purpose registers.
          //
          //   can_use_fp, can_use_r18 - when BOTH are true, rfp and r18_tls serve
          //                 as two extra temporaries and are not restored here;
          //                 otherwise a4 and a9 are spilled to the stack,
          //                 borrowed as temporaries and reloaded
          //   rc          - address of the round constant to apply; advanced by
          //                 8 bytes by the emitted code
          //   a0..a24     - the 25 state registers, updated in place
          //   tmp0..tmp2  - scratch registers, clobbered
          //
          // Phase 1: column parities c0..c4 and the values d0..d4 derived from
          // them are xor-ed into the matching columns (the existing comments
          // name them). rax1(d, x, y) presumably yields x ^ rol(y, 1) -- TODO
          // confirm against the helper.
 8821     __ eor3(tmp1, a4, a9, a14);
 8822     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 8823     __ eor3(tmp2, a1, a6, a11);
 8824     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 8825     __ rax1(tmp2, tmp0, tmp1); // d0
 8826     {
 8827
 8828       Register tmp3, tmp4;
 8829       if (can_use_fp && can_use_r18) {
 8830         tmp3 = rfp;
 8831         tmp4 = r18_tls;
 8832       } else {
              // a4 and a9 have already been folded into c4 above, so they can be
              // borrowed until they are reloaded below.
 8833         tmp3 = a4;
 8834         tmp4 = a9;
 8835         __ stp(tmp3, tmp4, __ pre(sp, -16));
 8836       }
 8837
 8838       __ eor3(tmp3, a0, a5, a10);
 8839       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 8840       __ eor(a0, a0, tmp2);
 8841       __ eor(a5, a5, tmp2);
 8842       __ eor(a10, a10, tmp2);
 8843       __ eor(a15, a15, tmp2);
 8844       __ eor(a20, a20, tmp2); // d0(tmp2)
 8845       __ eor3(tmp3, a2, a7, a12);
 8846       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 8847       __ rax1(tmp3, tmp4, tmp2); // d1
 8848       __ eor(a1, a1, tmp3);
 8849       __ eor(a6, a6, tmp3);
 8850       __ eor(a11, a11, tmp3);
 8851       __ eor(a16, a16, tmp3);
 8852       __ eor(a21, a21, tmp3); // d1(tmp3)
 8853       __ rax1(tmp3, tmp2, tmp0); // d3
 8854       __ eor3(tmp2, a3, a8, a13);
 8855       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 8856       __ eor(a3, a3, tmp3);
 8857       __ eor(a8, a8, tmp3);
 8858       __ eor(a13, a13, tmp3);
 8859       __ eor(a18, a18, tmp3);
 8860       __ eor(a23, a23, tmp3); // d3(tmp3)
 8861       __ rax1(tmp2, tmp1, tmp0); // d2
 8862       __ eor(a2, a2, tmp2);
 8863       __ eor(a7, a7, tmp2);
 8864       __ eor(a12, a12, tmp2);
 8865       __ rax1(tmp0, tmp0, tmp4); // d4
            // Last use of tmp3/tmp4 is above: reload a4 and a9 if they were
            // spilled (same condition as the spill, negated form).
 8866       if (!can_use_fp || !can_use_r18) {
 8867         __ ldp(tmp3, tmp4, __ post(sp, 16));
 8868       }
 8869       __ eor(a17, a17, tmp2);
 8870       __ eor(a22, a22, tmp2); // d2(tmp2)
 8871       __ eor(a4, a4, tmp0);
 8872       __ eor(a9, a9, tmp0);
 8873       __ eor(a14, a14, tmp0);
 8874       __ eor(a19, a19, tmp0);
 8875       __ eor(a24, a24, tmp0); // d4(tmp0)
 8876     }
 8877
          // Phase 2: rotate every register except a0 and move it to its new
          // position. The moves form one chain: each destination is the source
          // of the previous line, so only the first value (rotated a10) has to be
          // parked in tmp0 until the chain closes at a7.
 8878     __ rol(tmp0, a10, 3);
 8879     __ rol(a10, a1, 1);
 8880     __ rol(a1, a6, 44);
 8881     __ rol(a6, a9, 20);
 8882     __ rol(a9, a22, 61);
 8883     __ rol(a22, a14, 39);
 8884     __ rol(a14, a20, 18);
 8885     __ rol(a20, a2, 62);
 8886     __ rol(a2, a12, 43);
 8887     __ rol(a12, a13, 25);
 8888     __ rol(a13, a19, 8) ;
 8889     __ rol(a19, a23, 56);
 8890     __ rol(a23, a15, 41);
 8891     __ rol(a15, a4, 27);
 8892     __ rol(a4, a24, 14);
 8893     __ rol(a24, a21, 2);
 8894     __ rol(a21, a8, 55);
 8895     __ rol(a8, a16, 45);
 8896     __ rol(a16, a5, 36);
 8897     __ rol(a5, a3, 28);
 8898     __ rol(a3, a18, 21);
 8899     __ rol(a18, a17, 15);
 8900     __ rol(a17, a11, 10);
 8901     __ rol(a11, a7, 6);
 8902     __ mov(a7, tmp0);
 8903
          // Phase 3: apply bcax5 to each group of five consecutive registers.
 8904     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 8905     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 8906     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 8907     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 8908     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 8909
          // Phase 4: xor the 64-bit round constant at [rc] into a0 and step rc to
          // the next constant.
 8910     __ ldr(tmp1, __ post(rc, 8));
 8911     __ eor(a0, a0, tmp1);
 8912
 8913   }
 8914 
 8915   // Arguments:
 8916   //
 8917   // Inputs:
 8918   //   c_rarg0   - byte[]  source+offset
 8919   //   c_rarg1   - byte[]  SHA.state
 8920   //   c_rarg2   - int     block_size
 8921   //   c_rarg3   - int     offset
 8922   //   c_rarg4   - int     limit
 8923   //
 8924   address generate_sha3_implCompress_gpr(StubId stub_id) {
 8925     bool multi_block;
 8926     switch (stub_id) {
 8927     case StubId::stubgen_sha3_implCompress_id:
 8928       multi_block = false;
 8929       break;
 8930     case StubId::stubgen_sha3_implCompressMB_id:
 8931       multi_block = true;
 8932       break;
 8933     default:
 8934       ShouldNotReachHere();
 8935     }
 8936     int entry_count = StubInfo::entry_count(stub_id);
 8937     assert(entry_count == 1, "sanity check");
 8938     address start = load_archive_data(stub_id);
 8939     if (start != nullptr) {
 8940       return start;
 8941     }
 8942     __ align(CodeEntryAlignment);
 8943     StubCodeMark mark(this, stub_id);
 8944     start = __ pc();
 8945 
 8946     Register buf           = c_rarg0;
 8947     Register state         = c_rarg1;
 8948     Register block_size    = c_rarg2;
 8949     Register ofs           = c_rarg3;
 8950     Register limit         = c_rarg4;
 8951 
 8952     // use r3..r17, r19..r28 to keep a0..a24.
 8953     // a0..a24 are respective locals from SHA3.java
 8954     Register a0 = r25,
 8955              a1 = r26,
 8956              a2 = r27,
 8957              a3 = r3,
 8958              a4 = r4,
 8959              a5 = r5,
 8960              a6 = r6,
 8961              a7 = r7,
 8962              a8 = rscratch1, // r8
 8963              a9 = rscratch2, // r9
 8964              a10 = r10,
 8965              a11 = r11,
 8966              a12 = r12,
 8967              a13 = r13,
 8968              a14 = r14,
 8969              a15 = r15,
 8970              a16 = r16,
 8971              a17 = r17,
 8972              a18 = r28,
 8973              a19 = r19,
 8974              a20 = r20,
 8975              a21 = r21,
 8976              a22 = r22,
 8977              a23 = r23,
 8978              a24 = r24;
 8979 
 8980     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 8981 
 8982     Label sha3_loop, rounds24_preloop, loop_body;
 8983     Label sha3_512_or_sha3_384, shake128;
 8984 
 8985     bool can_use_r18 = false;
 8986 #ifndef R18_RESERVED
 8987     can_use_r18 = true;
 8988 #endif
 8989     bool can_use_fp = !PreserveFramePointer;
 8990 
 8991     __ enter();
 8992 
 8993     // save almost all yet unsaved gpr registers on stack
 8994     __ str(block_size, __ pre(sp, -128));
 8995     if (multi_block) {
 8996       __ stpw(ofs, limit, Address(sp, 8));
 8997     }
 8998     // 8 bytes at sp+16 will be used to keep buf
 8999     __ stp(r19, r20, Address(sp, 32));
 9000     __ stp(r21, r22, Address(sp, 48));
 9001     __ stp(r23, r24, Address(sp, 64));
 9002     __ stp(r25, r26, Address(sp, 80));
 9003     __ stp(r27, r28, Address(sp, 96));
 9004     if (can_use_r18 && can_use_fp) {
 9005       __ stp(r18_tls, state, Address(sp, 112));
 9006     } else {
 9007       __ str(state, Address(sp, 112));
 9008     }
 9009 
 9010     // begin sha3 calculations: loading a0..a24 from state arrary
 9011     __ ldp(a0, a1, state);
 9012     __ ldp(a2, a3, Address(state, 16));
 9013     __ ldp(a4, a5, Address(state, 32));
 9014     __ ldp(a6, a7, Address(state, 48));
 9015     __ ldp(a8, a9, Address(state, 64));
 9016     __ ldp(a10, a11, Address(state, 80));
 9017     __ ldp(a12, a13, Address(state, 96));
 9018     __ ldp(a14, a15, Address(state, 112));
 9019     __ ldp(a16, a17, Address(state, 128));
 9020     __ ldp(a18, a19, Address(state, 144));
 9021     __ ldp(a20, a21, Address(state, 160));
 9022     __ ldp(a22, a23, Address(state, 176));
 9023     __ ldr(a24, Address(state, 192));
 9024 
 9025     __ BIND(sha3_loop);
 9026 
 9027     // load input
 9028     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9029     __ eor(a0, a0, tmp3);
 9030     __ eor(a1, a1, tmp2);
 9031     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9032     __ eor(a2, a2, tmp3);
 9033     __ eor(a3, a3, tmp2);
 9034     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9035     __ eor(a4, a4, tmp3);
 9036     __ eor(a5, a5, tmp2);
 9037     __ ldr(tmp3, __ post(buf, 8));
 9038     __ eor(a6, a6, tmp3);
 9039 
 9040     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 9041     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 9042 
 9043     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9044     __ eor(a7, a7, tmp3);
 9045     __ eor(a8, a8, tmp2);
 9046     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9047     __ eor(a9, a9, tmp3);
 9048     __ eor(a10, a10, tmp2);
 9049     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9050     __ eor(a11, a11, tmp3);
 9051     __ eor(a12, a12, tmp2);
 9052     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9053     __ eor(a13, a13, tmp3);
 9054     __ eor(a14, a14, tmp2);
 9055     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9056     __ eor(a15, a15, tmp3);
 9057     __ eor(a16, a16, tmp2);
 9058 
 9059     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 9060     __ andw(tmp2, block_size, 48);
 9061     __ cbzw(tmp2, rounds24_preloop);
 9062     __ tbnz(block_size, 5, shake128);
 9063     // block_size == 144, bit5 == 0, SHA3-244
 9064     __ ldr(tmp3, __ post(buf, 8));
 9065     __ eor(a17, a17, tmp3);
 9066     __ b(rounds24_preloop);
 9067 
 9068     __ BIND(shake128);
 9069     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9070     __ eor(a17, a17, tmp3);
 9071     __ eor(a18, a18, tmp2);
 9072     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9073     __ eor(a19, a19, tmp3);
 9074     __ eor(a20, a20, tmp2);
 9075     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 9076 
 9077     __ BIND(sha3_512_or_sha3_384);
 9078     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9079     __ eor(a7, a7, tmp3);
 9080     __ eor(a8, a8, tmp2);
 9081     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 9082 
 9083     // SHA3-384
 9084     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9085     __ eor(a9, a9, tmp3);
 9086     __ eor(a10, a10, tmp2);
 9087     __ ldp(tmp3, tmp2, __ post(buf, 16));
 9088     __ eor(a11, a11, tmp3);
 9089     __ eor(a12, a12, tmp2);
 9090 
 9091     __ BIND(rounds24_preloop);
 9092     __ fmovs(v0, 24.0); // float loop counter,
 9093     __ fmovs(v1, 1.0);  // exact representation
 9094 
 9095     __ str(buf, Address(sp, 16));
 9096     __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));
 9097 
 9098     __ BIND(loop_body);
 9099     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 9100                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 9101                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 9102                      tmp0, tmp1, tmp2);
 9103     __ fsubs(v0, v0, v1);
 9104     __ fcmps(v0, 0.0);
 9105     __ br(__ NE, loop_body);
 9106 
 9107     if (multi_block) {
 9108       __ ldrw(block_size, sp); // block_size
 9109       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 9110       __ addw(tmp2, tmp2, block_size);
 9111       __ cmpw(tmp2, tmp1);
 9112       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 9113       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 9114       __ br(Assembler::LE, sha3_loop);
 9115       __ movw(c_rarg0, tmp2); // return offset
 9116     }
 9117     if (can_use_fp && can_use_r18) {
 9118       __ ldp(r18_tls, state, Address(sp, 112));
 9119     } else {
 9120       __ ldr(state, Address(sp, 112));
 9121     }
 9122     // save calculated sha3 state
 9123     __ stp(a0, a1, Address(state));
 9124     __ stp(a2, a3, Address(state, 16));
 9125     __ stp(a4, a5, Address(state, 32));
 9126     __ stp(a6, a7, Address(state, 48));
 9127     __ stp(a8, a9, Address(state, 64));
 9128     __ stp(a10, a11, Address(state, 80));
 9129     __ stp(a12, a13, Address(state, 96));
 9130     __ stp(a14, a15, Address(state, 112));
 9131     __ stp(a16, a17, Address(state, 128));
 9132     __ stp(a18, a19, Address(state, 144));
 9133     __ stp(a20, a21, Address(state, 160));
 9134     __ stp(a22, a23, Address(state, 176));
 9135     __ str(a24, Address(state, 192));
 9136 
 9137     // restore required registers from stack
 9138     __ ldp(r19, r20, Address(sp, 32));
 9139     __ ldp(r21, r22, Address(sp, 48));
 9140     __ ldp(r23, r24, Address(sp, 64));
 9141     __ ldp(r25, r26, Address(sp, 80));
 9142     __ ldp(r27, r28, Address(sp, 96));
 9143     if (can_use_fp && can_use_r18) {
 9144       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 9145     } // else no need to recalculate rfp, since it wasn't changed
 9146 
 9147     __ leave();
 9148 
 9149     __ ret(lr);
 9150 
 9151     // record the stub entry and end
 9152     store_archive_data(stub_id, start, __ pc());
 9153 
 9154     return start;
 9155   }
 9156 
 9157   /**
 9158    *  Arguments:
 9159    *
 9160    * Inputs:
 9161    *   c_rarg0   - int crc
 9162    *   c_rarg1   - byte* buf
 9163    *   c_rarg2   - int length
 9164    *
 9165    * Output:
 9166    *       r0    - int crc result
 9167    */
 9168   address generate_updateBytesCRC32() {
 9169     assert(UseCRC32Intrinsics, "what are we doing here?");
 9170     StubId id = StubId::stubgen_updateBytesCRC32_id;
 9171     int num_entries = StubInfo::entry_count(id);
 9172     assert(num_entries == 1, "sanity check");
 9173     // When load_archive_data() hands back a non-null address, that address is
 9174     // returned directly and no code is emitted here.
 9175     address archived = load_archive_data(id);
 9176     if (archived != nullptr) {
 9177       return archived;
 9178     }
 9179
 9180     __ align(CodeEntryAlignment);
 9181     StubCodeMark mark(this, id);
 9182     address entry = __ pc();
 9183
 9184     // Incoming arguments
 9185     const Register crc_val = c_rarg0;  // crc
 9186     const Register src     = c_rarg1;  // source java byte array address
 9187     const Register nbytes  = c_rarg2;  // length
 9188     // The remaining argument registers are handed to kernel_crc32 as well
 9189     const Register tbl0 = c_rarg3;     // crc_table address
 9190     const Register tbl1 = c_rarg4, tbl2 = c_rarg5, tbl3 = c_rarg6;
 9191     const Register scratch = c_rarg7;
 9192
 9193     BLOCK_COMMENT("Entry:");
 9194     __ enter(); // required for proper stackwalking of RuntimeStub frame
 9195     __ kernel_crc32(crc_val, src, nbytes, tbl0, tbl1, tbl2, tbl3,
 9196                     rscratch1, rscratch2, scratch);
 9197     __ leave(); // required for proper stackwalking of RuntimeStub frame
 9198     __ ret(lr);
 9199
 9200     // Record where the stub starts and where it ends.
 9201     store_archive_data(id, entry, __ pc());
 9202
 9203     return entry;
 9204   }
 9205 
 9206   /**
 9207    *  Arguments:
 9208    *
 9209    * Inputs:
 9210    *   c_rarg0   - int crc
 9211    *   c_rarg1   - byte* buf
 9212    *   c_rarg2   - int length
 9213    *   c_rarg3   - int* table
 9214    *
 9215    * Output:
 9216    *       r0   - int crc result
 9217    */
 9218   address generate_updateBytesCRC32C() {
 9219     assert(UseCRC32CIntrinsics, "what are we doing here?");
 9220     StubId id = StubId::stubgen_updateBytesCRC32C_id;
 9221     int num_entries = StubInfo::entry_count(id);
 9222     assert(num_entries == 1, "sanity check");
 9223     // When load_archive_data() hands back a non-null address, that address is
 9224     // returned directly and no code is emitted here.
 9225     address archived = load_archive_data(id);
 9226     if (archived != nullptr) {
 9227       return archived;
 9228     }
 9229
 9230     __ align(CodeEntryAlignment);
 9231     StubCodeMark mark(this, id);
 9232     address entry = __ pc();
 9233
 9234     // Incoming arguments
 9235     const Register crc_val = c_rarg0;  // crc
 9236     const Register src     = c_rarg1;  // source java byte array address
 9237     const Register nbytes  = c_rarg2;  // length
 9238     // The remaining argument registers are handed to kernel_crc32c as well
 9239     const Register tbl0 = c_rarg3;     // crc_table address
 9240     const Register tbl1 = c_rarg4, tbl2 = c_rarg5, tbl3 = c_rarg6;
 9241     const Register scratch = c_rarg7;
 9242
 9243     BLOCK_COMMENT("Entry:");
 9244     __ enter(); // required for proper stackwalking of RuntimeStub frame
 9245     __ kernel_crc32c(crc_val, src, nbytes, tbl0, tbl1, tbl2, tbl3,
 9246                      rscratch1, rscratch2, scratch);
 9247     __ leave(); // required for proper stackwalking of RuntimeStub frame
 9248     __ ret(lr);
 9249
 9250     // Record where the stub starts and where it ends.
 9251     store_archive_data(id, entry, __ pc());
 9252
 9253     return entry;
 9254   }
 9255 
 9256   /***
 9257    *  Arguments:
 9258    *
 9259    *  Inputs:
 9260    *   c_rarg0   - int   adler
 9261    *   c_rarg1   - byte* buff
 9262    *   c_rarg2   - int   len
 9263    *
 9264    * Output:
 9265    *   c_rarg0   - int adler result
 9266    */
 9267   address generate_updateBytesAdler32() {
          // Emits the Adler-32 update stub. The incoming adler value is split into
          // s1 (low 16 bits) and s2 (high 16 bits), both are updated over len bytes
          // read from buff, reduced modulo BASE, and recombined into c_rarg0.
          // Returns the stub's entry address: either the non-null address obtained
          // from load_archive_data(), or the pc at which generation started.
          // Note that this stub emits no enter()/leave() pair.
 9268     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 9269     int entry_count = StubInfo::entry_count(stub_id);
 9270     assert(entry_count == 1, "sanity check");
 9271     address start = load_archive_data(stub_id);
 9272     if (start != nullptr) {
 9273       return start;
 9274     }
 9275     __ align(CodeEntryAlignment);
 9276     StubCodeMark mark(this, stub_id);
 9277     start = __ pc();
 9278
 9279     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 9280
 9281     // Aliases
          // adler and s1 share c_rarg0: the low half is kept in place, the high
          // half is moved out to s2.
 9282     Register adler  = c_rarg0;
 9283     Register s1     = c_rarg0;
 9284     Register s2     = c_rarg3;
 9285     Register buff   = c_rarg1;
 9286     Register len    = c_rarg2;
 9287     Register nmax  = r4;
 9288     Register base  = r5;
 9289     Register count = r6;
 9290     Register temp0 = rscratch1;
 9291     Register temp1 = rscratch2;
 9292     FloatRegister vbytes = v0;
 9293     FloatRegister vs1acc = v1;
 9294     FloatRegister vs2acc = v2;
 9295     FloatRegister vtable = v3;
 9296
 9297     // Max number of bytes we can process before having to take the mod
 9298     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
          // BASE = 0xfff1 = 2^16 - 15, hence 2^16 == 15 (mod BASE). The reductions
          // below rely on this: x is folded as (x >> 16) * 15 + (x & 0xffff).
 9299     uint64_t BASE = 0xfff1;
 9300     uint64_t NMAX = 0x15B0;
 9301
 9302     __ mov(base, BASE);
 9303     __ mov(nmax, NMAX);
 9304
 9305     // Load accumulation coefficients for the upper 16 bits
 9306     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 9307     __ ld1(vtable, __ T16B, Address(temp0));
 9308
 9309     // s1 is initialized to the lower 16 bits of adler
 9310     // s2 is initialized to the upper 16 bits of adler
          // s2 must be extracted first: s1 aliases adler, so uxth overwrites it.
 9311     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 9312     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 9313
 9314     // The pipelined loop needs at least 16 elements for 1 iteration
 9315     // It does check this, but it is more effective to skip to the cleanup loop
 9316     __ cmp(len, (u1)16);
 9317     __ br(Assembler::HS, L_nmax);
          // len < 16 from here on; with len == 0 s1 and s2 are recombined unchanged.
 9318     __ cbz(len, L_combine);
 9319
          // Short input: one byte per iteration, 1..15 iterations.
 9320     __ bind(L_simple_by1_loop);
 9321     __ ldrb(temp0, Address(__ post(buff, 1)));
 9322     __ add(s1, s1, temp0);
 9323     __ add(s2, s2, s1);
 9324     __ subs(len, len, 1);
 9325     __ br(Assembler::HI, L_simple_by1_loop);
 9326
 9327     // s1 = s1 % BASE
          // At most 15 bytes were added to a 16-bit value, so s1 < 2 * BASE and a
          // single conditional subtraction is sufficient.
 9328     __ subs(temp0, s1, base);
 9329     __ csel(s1, temp0, s1, Assembler::HS);
 9330
 9331     // s2 = s2 % BASE
          // One fold, (s2 >> 16) * 15 + (s2 & 0xffff), then a conditional subtraction.
 9332     __ lsr(temp0, s2, 16);
 9333     __ lsl(temp1, temp0, 4);
 9334     __ sub(temp1, temp1, temp0);
 9335     __ add(s2, temp1, s2, ext::uxth);
 9336
 9337     __ subs(temp0, s2, base);
 9338     __ csel(s2, temp0, s2, Assembler::HS);
 9339
 9340     __ b(L_combine);
 9341
          // len >= 16. len is decremented by NMAX up front; a borrow (LO) means
          // fewer than NMAX bytes are available and the NMAX-sized chunks are skipped.
          // count = NMAX - 16 drives the 16-byte steps within one chunk.
 9342     __ bind(L_nmax);
 9343     __ subs(len, len, nmax);
 9344     __ sub(count, nmax, 16);
 9345     __ br(Assembler::LO, L_by16);
 9346
          // One full chunk of NMAX bytes, 16 bytes per iteration. The loop runs
          // until count goes below zero, i.e. NMAX / 16 iterations.
 9347     __ bind(L_nmax_loop);
 9348
 9349     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 9350                                       vbytes, vs1acc, vs2acc, vtable);
 9351
 9352     __ subs(count, count, 16);
 9353     __ br(Assembler::HS, L_nmax_loop);
 9354
 9355     // s1 = s1 % BASE
          // Two folds followed by one conditional subtraction.
 9356     __ lsr(temp0, s1, 16);
 9357     __ lsl(temp1, temp0, 4);
 9358     __ sub(temp1, temp1, temp0);
 9359     __ add(temp1, temp1, s1, ext::uxth);
 9360
 9361     __ lsr(temp0, temp1, 16);
 9362     __ lsl(s1, temp0, 4);
 9363     __ sub(s1, s1, temp0);
 9364     __ add(s1, s1, temp1, ext:: uxth);
 9365
 9366     __ subs(temp0, s1, base);
 9367     __ csel(s1, temp0, s1, Assembler::HS);
 9368
 9369     // s2 = s2 % BASE
 9370     __ lsr(temp0, s2, 16);
 9371     __ lsl(temp1, temp0, 4);
 9372     __ sub(temp1, temp1, temp0);
 9373     __ add(temp1, temp1, s2, ext::uxth);
 9374
 9375     __ lsr(temp0, temp1, 16);
 9376     __ lsl(s2, temp0, 4);
 9377     __ sub(s2, s2, temp0);
 9378     __ add(s2, s2, temp1, ext:: uxth);
 9379
 9380     __ subs(temp0, s2, base);
 9381     __ csel(s2, temp0, s2, Assembler::HS);
 9382
          // Try to take another full chunk; count is reset for the next pass.
 9383     __ subs(len, len, nmax);
 9384     __ sub(count, nmax, 16);
 9385     __ br(Assembler::HS, L_nmax_loop);
 9386
          // Here len == (bytes left) - NMAX and count == NMAX - 16, so the sum is
          // (bytes left) - 16. No carry (LO) means fewer than 16 bytes are left.
 9387     __ bind(L_by16);
 9388     __ adds(len, len, count);
 9389     __ br(Assembler::LO, L_by1);
 9390
          // Remaining whole 16-byte groups.
 9391     __ bind(L_by16_loop);
 9392
 9393     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 9394                                       vbytes, vs1acc, vs2acc, vtable);
 9395
 9396     __ subs(len, len, 16);
 9397     __ br(Assembler::HS, L_by16_loop);
 9398
          // len == (bytes left) - 16 here; adding 15 gives (bytes left) - 1, and
          // no carry (LO) means nothing is left.
 9399     __ bind(L_by1);
 9400     __ adds(len, len, 15);
 9401     __ br(Assembler::LO, L_do_mod);
 9402
          // Trailing bytes, one per iteration.
 9403     __ bind(L_by1_loop);
 9404     __ ldrb(temp0, Address(__ post(buff, 1)));
 9405     __ add(s1, temp0, s1);
 9406     __ add(s2, s2, s1);
 9407     __ subs(len, len, 1);
 9408     __ br(Assembler::HS, L_by1_loop);
 9409
          // Final reduction of both sums: two folds plus a conditional subtraction.
 9410     __ bind(L_do_mod);
 9411     // s1 = s1 % BASE
 9412     __ lsr(temp0, s1, 16);
 9413     __ lsl(temp1, temp0, 4);
 9414     __ sub(temp1, temp1, temp0);
 9415     __ add(temp1, temp1, s1, ext::uxth);
 9416
 9417     __ lsr(temp0, temp1, 16);
 9418     __ lsl(s1, temp0, 4);
 9419     __ sub(s1, s1, temp0);
 9420     __ add(s1, s1, temp1, ext:: uxth);
 9421
 9422     __ subs(temp0, s1, base);
 9423     __ csel(s1, temp0, s1, Assembler::HS);
 9424
 9425     // s2 = s2 % BASE
 9426     __ lsr(temp0, s2, 16);
 9427     __ lsl(temp1, temp0, 4);
 9428     __ sub(temp1, temp1, temp0);
 9429     __ add(temp1, temp1, s2, ext::uxth);
 9430
 9431     __ lsr(temp0, temp1, 16);
 9432     __ lsl(s2, temp0, 4);
 9433     __ sub(s2, s2, temp0);
 9434     __ add(s2, s2, temp1, ext:: uxth);
 9435
 9436     __ subs(temp0, s2, base);
 9437     __ csel(s2, temp0, s2, Assembler::HS);
 9438
 9439     // Combine lower bits and higher bits
          // s1 is c_rarg0, so the combined checksum ends up in the result register.
 9440     __ bind(L_combine);
 9441     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 9442
 9443     __ ret(lr);
 9444
 9445     // record the stub entry and end
 9446     store_archive_data(stub_id, start, __ pc());
 9447
 9448     return start;
 9449   }
 9450 
 9451   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 9452           Register temp0, Register temp1, FloatRegister vbytes,
 9453           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
          // Emits the code that folds the next 16 bytes at buff into the running
          // sums s1 and s2. buff is post-incremented by 16; temp0, temp1, vbytes,
          // vs1acc and vs2acc are overwritten. vtable is only read; per the
          // derivation below it is expected to hold the weights (16, 15, ... 1)
          // (the caller in this file loads it from _adler_table).
 9454     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 9455     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 9456     // In non-vectorized code, we update s1 and s2 as:
 9457     //   s1 <- s1 + b1
 9458     //   s2 <- s2 + s1
 9459     //   s1 <- s1 + b2
 9460     //   s2 <- s2 + s1
 9461     //   ...
 9462     //   s1 <- s1 + b16
 9463     //   s2 <- s2 + s1
 9464     // Putting above assignments together, we have:
 9465     //   s1_new = s1 + b1 + b2 + ... + b16
 9466     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 9467     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 9468     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 9469     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 9470
 9471     // s2 = s2 + s1 * 16
          // This must use the old s1, i.e. it has to precede the s1 update below.
 9472     __ add(s2, s2, s1, Assembler::LSL, 4);
 9473
 9474     // vs1acc = b1 + b2 + b3 + ... + b16
 9475     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
          // NOTE(review): the T8B / T16B pair presumably multiplies the low and
          // the high eight bytes respectively (UMULL / UMLAL2), leaving eight
          // partial sums that the T8H uaddlv then adds up - confirm against the
          // assembler definitions.
 9476     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 9477     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 9478     __ uaddlv(vs1acc, __ T16B, vbytes);
 9479     __ uaddlv(vs2acc, __ T8H, vs2acc);
 9480
 9481     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
          // The two vector sums are moved to general-purpose registers first.
 9482     __ fmovd(temp0, vs1acc);
 9483     __ fmovd(temp1, vs2acc);
 9484     __ add(s1, s1, temp0);
 9485     __ add(s2, s2, temp1);
 9486   }
 9487 
 9488   /**
 9489    *  Arguments:
 9490    *
 9491    *  Input:
 9492    *    c_rarg0   - x address
 9493    *    c_rarg1   - x length
 9494    *    c_rarg2   - y address
 9495    *    c_rarg3   - y length
 9496    *    c_rarg4   - z address
 9497    */
 9498   address generate_multiplyToLen() {
 9499     StubId id = StubId::stubgen_multiplyToLen_id;
 9500     int num_entries = StubInfo::entry_count(id);
 9501     assert(num_entries == 1, "sanity check");
 9502     // A non-null result from load_archive_data() is returned unchanged and
 9503     // nothing is generated.
 9504     address archived = load_archive_data(id);
 9505     if (archived != nullptr) {
 9506       return archived;
 9507     }
 9508
 9509     __ align(CodeEntryAlignment);
 9510     StubCodeMark mark(this, id);
 9511     address entry = __ pc();
 9512
 9513     // Operands: x address/length, y address/length, z address
 9514     const Register x_ptr = r0, x_len = r1;
 9515     const Register y_ptr = r2, y_len = r3;
 9516     const Register z_ptr = r4;
 9517
 9518     // Temporaries passed through to multiply_to_len
 9519     const Register t0 = r5;
 9520     const Register t1 = r10, t2 = r11;
 9521     const Register t3 = r12, t4 = r13;
 9522     const Register t5 = r14, t6 = r15, t7 = r16;
 9523
 9524     BLOCK_COMMENT("Entry:");
 9525     __ enter(); // required for proper stackwalking of RuntimeStub frame
 9526     __ multiply_to_len(x_ptr, x_len, y_ptr, y_len, z_ptr,
 9527                        t0, t1, t2, t3, t4, t5, t6, t7);
 9528     __ leave(); // required for proper stackwalking of RuntimeStub frame
 9529     __ ret(lr);
 9530
 9531     // Record where the stub starts and where it ends.
 9532     store_archive_data(id, entry, __ pc());
 9533
 9534     return entry;
 9535   }
 9536 
 9537   address generate_squareToLen() {
 9538     // The squareToLen algorithm for sizes 1..127 described in the java code is
 9539     // faster than multiply_to_len on some CPUs and slower on others; overall
 9540     // multiply_to_len shows slightly better results, so it is what gets emitted.
 9541     StubId id = StubId::stubgen_squareToLen_id;
 9542     int num_entries = StubInfo::entry_count(id);
 9543     assert(num_entries == 1, "sanity check");
 9544     // A non-null result from load_archive_data() is returned unchanged.
 9545     address archived = load_archive_data(id);
 9546     if (archived != nullptr) {
 9547       return archived;
 9548     }
 9549
 9550     __ align(CodeEntryAlignment);
 9551     StubCodeMark mark(this, id);
 9552     address entry = __ pc();
 9553
 9554     // Operand registers
 9555     const Register src = r0, src_len = r1, dst = r2;
 9556     // Second multiplicand: receives copies of src and src_len below
 9557     const Register dup = r4, dup_len = r5;
 9558
 9559     // Temporaries passed through to multiply_to_len
 9560     const Register t0 = r3;
 9561     const Register t1 = r10, t2 = r11, t3 = r12, t4 = r13;
 9562     const Register t5 = r14, t6 = r15, t7 = r16;
 9563
 9564     // r4/r5 are overwritten with the copies, so they are pushed and popped
 9565     RegSet saved = RegSet::of(dup, dup_len);
 9566
 9567     BLOCK_COMMENT("Entry:");
 9568     __ enter();
 9569     __ push(saved, sp);
 9570     __ mov(dup, src);
 9571     __ mov(dup_len, src_len);
 9572     __ multiply_to_len(src, src_len, dup, dup_len, dst,
 9573                        t0, t1, t2, t3, t4, t5, t6, t7);
 9574     __ pop(saved, sp);
 9575     __ leave();
 9576     __ ret(lr);
 9577
 9578     // Record where the stub starts and where it ends.
 9579     store_archive_data(id, entry, __ pc());
 9580
 9581     return entry;
 9582   }
 9583 
 9584   address generate_mulAdd() {
 9585     StubId id = StubId::stubgen_mulAdd_id;
 9586     int num_entries = StubInfo::entry_count(id);
 9587     assert(num_entries == 1, "sanity check");
 9588     // A non-null result from load_archive_data() is returned unchanged.
 9589     address archived = load_archive_data(id);
 9590     if (archived != nullptr) {
 9591       return archived;
 9592     }
 9593
 9594     __ align(CodeEntryAlignment);
 9595     StubCodeMark mark(this, id);
 9596     address entry = __ pc();
 9597
 9598     // Register assignment for the five operands of mul_add
 9599     const Register out_ptr = r0, in_ptr = r1;
 9600     const Register off = r2, count = r3;
 9601     const Register mult = r4;
 9602
 9603     BLOCK_COMMENT("Entry:");
 9604     __ enter();
 9605     __ mul_add(out_ptr, in_ptr, off, count, mult);
 9606     __ leave();
 9607     __ ret(lr);
 9608
 9609     // Record where the stub starts and where it ends.
 9610     store_archive_data(id, entry, __ pc());
 9611
 9612     return entry;
 9613   }
 9614 
 9615   // Arguments:
 9616   //
 9617   // Input:
 9618   //   c_rarg0   - newArr address
 9619   //   c_rarg1   - oldArr address
 9620   //   c_rarg2   - newIdx
 9621   //   c_rarg3   - shiftCount
 9622   //   c_rarg4   - numIter
 9623   //
 9624   address generate_bigIntegerRightShift() {
          // Emits the right-shift worker stub. For each i in [0, numIter) it stores
          //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
          // on 32-bit words. Words are produced from the highest index downwards:
          // four at a time with SIMD, then two at a time, then a final single word.
          // Inputs with fewer than four words take a scalar path instead.
          // Returns the stub's entry address (a non-null address from
          // load_archive_data(), or the pc at which generation started).
          // No enter()/leave() pair is emitted.
 9625     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 9626     int entry_count = StubInfo::entry_count(stub_id);
 9627     assert(entry_count == 1, "sanity check");
 9628     address start = load_archive_data(stub_id);
 9629     if (start != nullptr) {
 9630       return start;
 9631     }
 9632     __ align(CodeEntryAlignment);
 9633     StubCodeMark mark(this, stub_id);
 9634     start = __ pc();
 9635
 9636     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 9637
 9638     Register newArr        = c_rarg0;
 9639     Register oldArr        = c_rarg1;
 9640     Register newIdx        = c_rarg2;
 9641     Register shiftCount    = c_rarg3;
 9642     Register numIter       = c_rarg4;
          // idx shares the register with numIter: it counts the words still to do
          // and doubles as the index of the lowest word of the next group.
 9643     Register idx           = numIter;
 9644
 9645     Register newArrCur     = rscratch1;
 9646     Register shiftRevCount = rscratch2;
 9647     Register oldArrCur     = r13;
 9648     Register oldArrNext    = r14;
 9649
 9650     FloatRegister oldElem0        = v0;
 9651     FloatRegister oldElem1        = v1;
 9652     FloatRegister newElem         = v2;
 9653     FloatRegister shiftVCount     = v3;
 9654     FloatRegister shiftVRevCount  = v4;
 9655
          // Nothing to do for zero iterations.
 9656     __ cbz(idx, Exit);
 9657
          // Rebase newArr so that it points at newArr[newIdx] (4-byte elements).
 9658     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 9659
 9660     // left shift count
          // shiftRevCount = 32 - shiftCount
 9661     __ movw(shiftRevCount, 32);
 9662     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 9663
 9664     // numIter too small to allow a 4-words SIMD loop, rolling back
 9665     __ cmp(numIter, (u1)4);
 9666     __ br(Assembler::LT, ShiftThree);
 9667
          // Broadcast both counts to all four lanes. shiftVCount is negated so
          // that ushl shifts right by shiftCount (AArch64 USHL shifts right when
          // the per-lane count is negative).
 9668     __ dup(shiftVCount,    __ T4S, shiftCount);
 9669     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 9670     __ negr(shiftVCount,   __ T4S, shiftVCount);
 9671
 9672     __ BIND(ShiftSIMDLoop);
 9673
 9674     // Calculate the load addresses
          // oldArrNext -> oldArr[idx], oldArrCur -> oldArr[idx + 1],
          // newArrCur -> newArr[idx] (after idx has been lowered by 4).
 9675     __ sub(idx, idx, 4);
 9676     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 9677     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 9678     __ add(oldArrCur,  oldArrNext, 4);
 9679
 9680     // Load 4 words and process
 9681     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 9682     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 9683     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 9684     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 9685     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 9686     __ st1(newElem,   __ T4S,  Address(newArrCur));
 9687
          // Keep going while at least four words are left.
 9688     __ cmp(idx, (u1)4);
 9689     __ br(Assembler::LT, ShiftTwoLoop);
 9690     __ b(ShiftSIMDLoop);
 9691
          // 0..3 words are left: done on 0, a single word on 1, otherwise take
          // the top two words and re-check.
 9692     __ BIND(ShiftTwoLoop);
 9693     __ cbz(idx, Exit);
 9694     __ cmp(idx, (u1)1);
 9695     __ br(Assembler::EQ, ShiftOne);
 9696
 9697     // Calculate the load addresses
 9698     __ sub(idx, idx, 2);
 9699     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 9700     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 9701     __ add(oldArrCur,  oldArrNext, 4);
 9702
 9703     // Load 2 words and process
          // Same computation as above on two lanes, reusing the broadcast counts.
 9704     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 9705     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 9706     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 9707     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 9708     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 9709     __ st1(newElem,   __ T2S, Address(newArrCur));
 9710     __ b(ShiftTwoLoop);
 9711
          // Scalar path for 1..3 words in total. Bit 1 clear means one word, bit 0
          // clear means two words; otherwise three. The cases fall through from
          // word 2 to word 1 to word 0.
 9712     __ BIND(ShiftThree);
 9713     __ tbz(idx, 1, ShiftOne);
 9714     __ tbz(idx, 0, ShiftTwo);
          // newArr[2] = (oldArr[3] >>> shiftCount) | (oldArr[2] << shiftRevCount)
 9715     __ ldrw(r10,  Address(oldArr, 12));
 9716     __ ldrw(r11,  Address(oldArr, 8));
 9717     __ lsrvw(r10, r10, shiftCount);
 9718     __ lslvw(r11, r11, shiftRevCount);
 9719     __ orrw(r12,  r10, r11);
 9720     __ strw(r12,  Address(newArr, 8));
 9721
          // newArr[1] = (oldArr[2] >>> shiftCount) | (oldArr[1] << shiftRevCount)
 9722     __ BIND(ShiftTwo);
 9723     __ ldrw(r10,  Address(oldArr, 8));
 9724     __ ldrw(r11,  Address(oldArr, 4));
 9725     __ lsrvw(r10, r10, shiftCount);
 9726     __ lslvw(r11, r11, shiftRevCount);
 9727     __ orrw(r12,  r10, r11);
 9728     __ strw(r12,  Address(newArr, 4));
 9729
          // newArr[0] = (oldArr[1] >>> shiftCount) | (oldArr[0] << shiftRevCount)
          // Also reached from ShiftTwoLoop for the last remaining word.
 9730     __ BIND(ShiftOne);
 9731     __ ldrw(r10,  Address(oldArr, 4));
 9732     __ ldrw(r11,  Address(oldArr));
 9733     __ lsrvw(r10, r10, shiftCount);
 9734     __ lslvw(r11, r11, shiftRevCount);
 9735     __ orrw(r12,  r10, r11);
 9736     __ strw(r12,  Address(newArr));
 9737
 9738     __ BIND(Exit);
 9739     __ ret(lr);
 9740
 9741     // record the stub entry and end
 9742     store_archive_data(stub_id, start, __ pc());
 9743
 9744     return start;
 9745   }
 9746 
 9747   // Arguments:
 9748   //
 9749   // Input:
 9750   //   c_rarg0   - newArr address
 9751   //   c_rarg1   - oldArr address
 9752   //   c_rarg2   - newIdx
 9753   //   c_rarg3   - shiftCount
 9754   //   c_rarg4   - numIter
 9755   //
 9756   address generate_bigIntegerLeftShift() {
          // Emits the left-shift worker stub. For each i in [0, numIter) it stores
          //   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
          // on 32-bit words. Words are produced from the lowest index upwards using
          // post-incremented pointers: four at a time with SIMD, then two at a
          // time, then a final single word. Inputs with fewer than four words take
          // a scalar path instead.
          // Returns the stub's entry address (a non-null address from
          // load_archive_data(), or the pc at which generation started).
          // No enter()/leave() pair is emitted.
 9757     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 9758     int entry_count = StubInfo::entry_count(stub_id);
 9759     assert(entry_count == 1, "sanity check");
 9760     address start = load_archive_data(stub_id);
 9761     if (start != nullptr) {
 9762       return start;
 9763     }
 9764     __ align(CodeEntryAlignment);
 9765     StubCodeMark mark(this, stub_id);
 9766     start = __ pc();
 9767
 9768     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 9769
 9770     Register newArr        = c_rarg0;
 9771     Register oldArr        = c_rarg1;
 9772     Register newIdx        = c_rarg2;
 9773     Register shiftCount    = c_rarg3;
 9774     Register numIter       = c_rarg4;
 9775
 9776     Register shiftRevCount = rscratch1;
 9777     Register oldArrNext    = rscratch2;
 9778
 9779     FloatRegister oldElem0        = v0;
 9780     FloatRegister oldElem1        = v1;
 9781     FloatRegister newElem         = v2;
 9782     FloatRegister shiftVCount     = v3;
 9783     FloatRegister shiftVRevCount  = v4;
 9784
          // Nothing to do for zero iterations.
 9785     __ cbz(numIter, Exit);
 9786
          // oldArrNext runs one word ahead of oldArr; newArr is rebased so that it
          // points at newArr[newIdx] (4-byte elements).
 9787     __ add(oldArrNext, oldArr, 4);
 9788     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 9789
 9790     // right shift count
          // shiftRevCount = 32 - shiftCount
 9791     __ movw(shiftRevCount, 32);
 9792     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 9793
 9794     // numIter too small to allow a 4-words SIMD loop, rolling back
 9795     __ cmp(numIter, (u1)4);
 9796     __ br(Assembler::LT, ShiftThree);
 9797
          // Broadcast both counts to all four lanes. shiftVRevCount is negated so
          // that ushl shifts right by shiftRevCount (AArch64 USHL shifts right
          // when the per-lane count is negative).
 9798     __ dup(shiftVCount,     __ T4S, shiftCount);
 9799     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 9800     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 9801
 9802     __ BIND(ShiftSIMDLoop);
 9803
 9804     // load 4 words and process
          // All three pointers advance by 16 bytes per iteration.
 9805     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 9806     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 9807     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 9808     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 9809     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 9810     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 9811     __ sub(numIter,   numIter, 4);
 9812
          // Keep going while at least four words are left.
 9813     __ cmp(numIter, (u1)4);
 9814     __ br(Assembler::LT, ShiftTwoLoop);
 9815     __ b(ShiftSIMDLoop);
 9816
          // 0..3 words are left: done on 0, a single word on 1, otherwise take
          // the next two words and re-check.
 9817     __ BIND(ShiftTwoLoop);
 9818     __ cbz(numIter, Exit);
 9819     __ cmp(numIter, (u1)1);
 9820     __ br(Assembler::EQ, ShiftOne);
 9821
 9822     // load 2 words and process
          // Same computation as above on two lanes, reusing the broadcast counts.
 9823     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 9824     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 9825     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 9826     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 9827     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 9828     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 9829     __ sub(numIter,   numIter, 2);
 9830     __ b(ShiftTwoLoop);
 9831
          // Scalar path for 1..3 words in total. The first word is always
          // processed; numIter itself is left unmodified and its low bits then
          // select how many more follow: bit 1 clear means one word in total
          // (done), bit 0 clear means two (one more), otherwise three (two more).
 9832     __ BIND(ShiftThree);
 9833     __ ldrw(r10,  __ post(oldArr, 4));
 9834     __ ldrw(r11,  __ post(oldArrNext, 4));
 9835     __ lslvw(r10, r10, shiftCount);
 9836     __ lsrvw(r11, r11, shiftRevCount);
 9837     __ orrw(r12,  r10, r11);
 9838     __ strw(r12,  __ post(newArr, 4));
 9839     __ tbz(numIter, 1, Exit);
 9840     __ tbz(numIter, 0, ShiftOne);
 9841
          // Second of three words; falls through to the last one.
 9842     __ BIND(ShiftTwo);
 9843     __ ldrw(r10,  __ post(oldArr, 4));
 9844     __ ldrw(r11,  __ post(oldArrNext, 4));
 9845     __ lslvw(r10, r10, shiftCount);
 9846     __ lsrvw(r11, r11, shiftRevCount);
 9847     __ orrw(r12,  r10, r11);
 9848     __ strw(r12,  __ post(newArr, 4));
 9849
          // Last word; the pointers are not advanced any further. Also reached
          // from ShiftTwoLoop for the last remaining word.
 9850     __ BIND(ShiftOne);
 9851     __ ldrw(r10,  Address(oldArr));
 9852     __ ldrw(r11,  Address(oldArrNext));
 9853     __ lslvw(r10, r10, shiftCount);
 9854     __ lsrvw(r11, r11, shiftRevCount);
 9855     __ orrw(r12,  r10, r11);
 9856     __ strw(r12,  Address(newArr));
 9857
 9858     __ BIND(Exit);
 9859     __ ret(lr);
 9860
 9861     // record the stub entry and end
 9862     store_archive_data(stub_id, start, __ pc());
 9863
 9864     return start;
 9865   }
 9866 
 9867   address generate_count_positives(address &count_positives_long) {
          // Emits the count_positives stub and returns its primary entry point.
          // The secondary entry point, which goes straight to the len > 15 path,
          // is reported through count_positives_long.
          //
          // Register usage (see the declarations below): ary1 = r1, len = r2,
          // result = r0; a copy of len must already be in result on entry.
          // Bytes are tested against UPPER_BIT_MASK, i.e. bit 7 of every byte.
          //  - len <= 15: result is cleared to zero when a byte with bit 7 set is
          //    found, otherwise it is left untouched.
          //  - len > 15: when such a byte is found the stub returns result - len,
          //    where len is the number of bytes remaining counted from the start
          //    of the chunk that contained the byte; otherwise result is
          //    returned unchanged.
 9868     StubId stub_id = StubId::stubgen_count_positives_id;
 9869     GrowableArray<address> entries;
 9870     int entry_count = StubInfo::entry_count(stub_id);
 9871     // We have an extra entry for count_positives_long.
 9872     assert(entry_count == 2, "sanity check");
          // If load_archive_data() hands back a non-null address, no code is
          // generated here; the extra entry point is taken from 'entries'.
 9873     address start = load_archive_data(stub_id, &entries);
 9874     if (start != nullptr) {
 9875       assert(entries.length() == 1,
 9876              "unexpected extra entry count %d", entries.length());
 9877       count_positives_long = entries.at(0);
 9878       return start;
 9879     }
 9880     const u1 large_loop_size = 64;
 9881     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 9882     int dcache_line = VM_Version::dcache_line_size();
 9883 
 9884     Register ary1 = r1, len = r2, result = r0;
 9885 
 9886     __ align(CodeEntryAlignment);
 9887     StubCodeMark mark(this, stub_id);
 9888 
 9889     address entry = __ pc();
 9890 
 9891     __ enter();
 9892     // precondition: a copy of len is already in result
 9893     // __ mov(result, len);
 9894 
 9895   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 9896         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 9897 
 9898   __ cmp(len, (u1)15);
 9899   __ br(Assembler::GT, LEN_OVER_15);
 9900   // The only case when execution falls into this code is when pointer is near
 9901   // the end of memory page and we have to avoid reading next page
        // ary1 is moved to one past the last byte so that all loads below end
        // exactly at the end of the array and never touch the following page.
 9902   __ add(ary1, ary1, len);
 9903   __ subs(len, len, 8);
 9904   __ br(Assembler::GT, LEN_OVER_8);
        // len <= 8 originally: load the 8 bytes that end at the array end, then
        // shift right by (8 - original len) * 8 bits to drop the bytes that lie
        // in front of the array (len is now original len - 8, hence the negation).
 9905   __ ldr(rscratch2, Address(ary1, -8));
 9906   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 9907   __ lsrv(rscratch2, rscratch2, rscratch1);
 9908   __ tst(rscratch2, UPPER_BIT_MASK);
 9909   __ csel(result, zr, result, Assembler::NE);
 9910   __ leave();
 9911   __ ret(lr);
 9912   __ bind(LEN_OVER_8);
        // 9..15 bytes: load the 16 bytes that end at the array end. rscratch2
        // holds the last 8 bytes and is tested whole; rscratch1 holds the 8 bytes
        // before that and is shifted to drop whatever precedes the array.
 9913   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 9914   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 9915   __ tst(rscratch2, UPPER_BIT_MASK);
 9916   __ br(Assembler::NE, RET_NO_POP);
 9917   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 9918   __ lsrv(rscratch1, rscratch1, rscratch2);
 9919   __ tst(rscratch1, UPPER_BIT_MASK);
 9920   __ bind(RET_NO_POP);
        // No registers were pushed on the short path, so return without a pop.
 9921   __ csel(result, zr, result, Assembler::NE);
 9922   __ leave();
 9923   __ ret(lr);
 9924 
 9925   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 9926   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 9927 
        // The second entry point builds its own frame and then joins the
        // LEN_OVER_15 path that the primary entry branches to.
 9928   count_positives_long = __ pc(); // 2nd entry point
 9929   entries.append(count_positives_long);
 9930 
 9931   __ enter();
 9932 
 9933   __ bind(LEN_OVER_15);
 9934     __ push(spilled_regs, sp);
 9935     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 9936     __ cbz(rscratch2, ALIGNED);
          // Unaligned start: test the first 16 bytes with one unaligned load and
          // advance ary1 to the next 16-byte boundary. The branch to RET_ADJUST is
          // taken before len is reduced.
 9937     __ ldp(tmp6, tmp1, Address(ary1));
 9938     __ mov(tmp5, 16);
 9939     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 9940     __ add(ary1, ary1, rscratch1);
 9941     __ orr(tmp6, tmp6, tmp1);
 9942     __ tst(tmp6, UPPER_BIT_MASK);
 9943     __ br(Assembler::NE, RET_ADJUST);
 9944     __ sub(len, len, rscratch1);
 9945 
 9946   __ bind(ALIGNED);
 9947     __ cmp(len, large_loop_size);
 9948     __ br(Assembler::LT, CHECK_16);
 9949     // Perform 16-byte load as early return in pre-loop to handle situation
 9950     // when initially aligned large array has negative values at starting bytes,
 9951     // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
 9952     // slower. Cases with negative bytes further ahead won't be affected that
 9953     // much. In fact, it'll be faster due to early loads, less instructions and
 9954     // less branches in LARGE_LOOP.
 9955     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 9956     __ sub(len, len, 16);
 9957     __ orr(tmp6, tmp6, tmp1);
 9958     __ tst(tmp6, UPPER_BIT_MASK);
 9959     __ br(Assembler::NE, RET_ADJUST_16);
 9960     __ cmp(len, large_loop_size);
 9961     __ br(Assembler::LT, CHECK_16);
 9962 
 9963     if (SoftwarePrefetchHintDistance >= 0
 9964         && SoftwarePrefetchHintDistance >= dcache_line) {
 9965       // initial prefetch
 9966       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 9967     }
 9968   __ bind(LARGE_LOOP);
 9969     if (SoftwarePrefetchHintDistance >= 0) {
 9970       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 9971     }
 9972     // Issue load instructions first, since it can save few CPU/MEM cycles, also
 9973     // instead of 4 triples of "orr(...); andr(...); cbnz(...);" (for each ldp)
 9974     // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
 9975     // instructions per loop iteration and has fewer branches, but this approach
 9976     // disables early return, thus, all 64 bytes are loaded and checked every time.
 9977     __ ldp(tmp2, tmp3, Address(ary1));
 9978     __ ldp(tmp4, tmp5, Address(ary1, 16));
 9979     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 9980     __ ldp(tmp6, tmp1, Address(ary1, 48));
 9981     __ add(ary1, ary1, large_loop_size);
 9982     __ sub(len, len, large_loop_size);
 9983     __ orr(tmp2, tmp2, tmp3);
 9984     __ orr(tmp4, tmp4, tmp5);
 9985     __ orr(rscratch1, rscratch1, rscratch2);
 9986     __ orr(tmp6, tmp6, tmp1);
 9987     __ orr(tmp2, tmp2, tmp4);
 9988     __ orr(rscratch1, rscratch1, tmp6);
 9989     __ orr(tmp2, tmp2, rscratch1);
 9990     __ tst(tmp2, UPPER_BIT_MASK);
 9991     __ br(Assembler::NE, RET_ADJUST_LONG);
 9992     __ cmp(len, large_loop_size);
 9993     __ br(Assembler::GE, LARGE_LOOP);
 9994 
 9995   __ bind(CHECK_16); // small 16-byte load pre-loop
 9996     __ cmp(len, (u1)16);
 9997     __ br(Assembler::LT, POST_LOOP16);
 9998 
 9999   __ bind(LOOP16); // small 16-byte load loop
10000     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
10001     __ sub(len, len, 16);
10002     __ orr(tmp2, tmp2, tmp3);
10003     __ tst(tmp2, UPPER_BIT_MASK);
10004     __ br(Assembler::NE, RET_ADJUST_16);
10005     __ cmp(len, (u1)16);
10006     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
10007 
10008   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
          // Fewer than 16 bytes remain. With more than 8 left, test one full
          // 8-byte word first; RET_ADJUST is taken before len is reduced.
10009     __ cmp(len, (u1)8);
10010     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
10011     __ ldr(tmp3, Address(__ post(ary1, 8)));
10012     __ tst(tmp3, UPPER_BIT_MASK);
10013     __ br(Assembler::NE, RET_ADJUST);
10014     __ sub(len, len, 8);
10015 
10016   __ bind(POST_LOOP16_LOAD_TAIL);
10017     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
          // Load 8 bytes and shift left by 64 - len * 8 bits so that only the
          // len bytes still belonging to the array take part in the test.
10018     __ ldr(tmp1, Address(ary1));
10019     __ mov(tmp2, 64);
10020     __ sub(tmp4, tmp2, len, __ LSL, 3);
10021     __ lslv(tmp1, tmp1, tmp4);
10022     __ tst(tmp1, UPPER_BIT_MASK);
10023     __ br(Assembler::NE, RET_ADJUST);
10024     // Fallthrough
10025 
10026   __ bind(RET_LEN);
          // Nothing found: result still holds the value it had on entry.
10027     __ pop(spilled_regs, sp);
10028     __ leave();
10029     __ ret(lr);
10030 
10031     // difference result - len is the count of guaranteed to be
10032     // positive bytes
10033 
          // The three labels below fall through into each other: RET_ADJUST_LONG
          // gives back large_loop_size bytes in total (48 here + 16 below), and
          // RET_ADJUST_16 gives back the 16 bytes of the chunk just tested.
10034   __ bind(RET_ADJUST_LONG);
10035     __ add(len, len, (u1)(large_loop_size - 16));
10036   __ bind(RET_ADJUST_16);
10037     __ add(len, len, 16);
10038   __ bind(RET_ADJUST);
10039     __ pop(spilled_regs, sp);
10040     __ leave();
10041     __ sub(result, result, len);
10042     __ ret(lr);
10043 
10044     // record the stub entry and end plus the extra entry
10045     store_archive_data(stub_id, entry, __ pc(), &entries);
10046 
10047     return entry;
10048   }
10049 
10050   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
10051         bool usePrefetch, Label &NOT_EQUAL) {
          // Emits the general-purpose-register main loop used by
          // generate_large_array_equals(). Each iteration compares 8 * wordSize
          // bytes of a1 against a2 through four ldp pairs per array and branches
          // to NOT_EQUAL on the first difference.
          //
          // loopThreshold - the loop repeats while cnt1 - loopThreshold >= 0
          // usePrefetch   - when true, a prfm at SoftwarePrefetchHintDistance is
          //                 emitted for both arrays at the top of every iteration
          // NOT_EQUAL     - label owned by the caller, target for a mismatch
          //
          // The loop is software pipelined: the pair loaded in one stage is
          // XOR-compared while the next pair is being loaded, so 2 * wordSize
          // bytes of each array are loaded before the loop and compared after it.
          // NOTE: 'result' is declared below but not referenced in this helper.
10052     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10053         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
10054         tmp7 = r12, tmp8 = r13;
10055     Label LOOP;
10056 
          // Prime the pipeline with the first pair from each array.
10057     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10058     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10059     __ bind(LOOP);
10060     if (usePrefetch) {
10061       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
10062       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
10063     }
10064     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
10065     __ eor(tmp1, tmp1, tmp2);
10066     __ eor(tmp3, tmp3, tmp4);
10067     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
10068     __ orr(tmp1, tmp1, tmp3);
10069     __ cbnz(tmp1, NOT_EQUAL);
10070     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10071     __ eor(tmp5, tmp5, tmp6);
10072     __ eor(tmp7, tmp7, tmp8);
10073     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10074     __ orr(tmp5, tmp5, tmp7);
10075     __ cbnz(tmp5, NOT_EQUAL);
10076     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
10077     __ eor(tmp1, tmp1, tmp2);
10078     __ eor(tmp3, tmp3, tmp4);
10079     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
10080     __ orr(tmp1, tmp1, tmp3);
10081     __ cbnz(tmp1, NOT_EQUAL);
10082     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10083     __ eor(tmp5, tmp5, tmp6);
10084     __ sub(cnt1, cnt1, 8 * wordSize);
10085     __ eor(tmp7, tmp7, tmp8);
10086     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10087     // The value written to tmp6 is not used, only the flags are.
10088     // MacroAssembler::subs is used here (rather than cmp) because subs
          // allows an unlimited range of immediate operand. The flags it sets are
          // consumed by the br(GE) below; the orr and cbnz in between are emitted
          // after it and must not disturb them.
10089     __ subs(tmp6, cnt1, loopThreshold);
10090     __ orr(tmp5, tmp5, tmp7);
10091     __ cbnz(tmp5, NOT_EQUAL);
10092     __ br(__ GE, LOOP);
10093     // post-loop
          // Compare the pair that was loaded ahead by the last iteration and
          // account for it in cnt1.
10094     __ eor(tmp1, tmp1, tmp2);
10095     __ eor(tmp3, tmp3, tmp4);
10096     __ orr(tmp1, tmp1, tmp3);
10097     __ sub(cnt1, cnt1, 2 * wordSize);
10098     __ cbnz(tmp1, NOT_EQUAL);
10099   }
10100 
10101   void generate_large_array_equals_loop_simd(int loopThreshold,
10102         bool usePrefetch, Label &NOT_EQUAL) {
          // Emits the SIMD main loop used by generate_large_array_equals(). Each
          // iteration loads four T2D vectors from a1 (v0-v3) and four from a2
          // (v4-v7), XORs them pairwise, ORs the results together and branches to
          // NOT_EQUAL if any bit is set.
          //
          // loopThreshold - the loop repeats while cnt1 - loopThreshold >= 0
          // usePrefetch   - when true, a prfm at SoftwarePrefetchHintDistance is
          //                 emitted for both arrays at the top of every iteration
          // NOT_EQUAL     - label owned by the caller, target for a mismatch
          //
          // NOTE: 'result' is declared below but not referenced in this helper.
10103     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10104         tmp2 = rscratch2;
10105     Label LOOP;
10106 
10107     __ bind(LOOP);
10108     if (usePrefetch) {
10109       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
10110       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
10111     }
10112     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
10113     __ sub(cnt1, cnt1, 8 * wordSize);
10114     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
          // Only the flags from this subs matter (they feed the br(GE) at the end
          // of the loop); tmp1 itself is overwritten by the umov below.
10115     __ subs(tmp1, cnt1, loopThreshold);
10116     __ eor(v0, __ T16B, v0, v4);
10117     __ eor(v1, __ T16B, v1, v5);
10118     __ eor(v2, __ T16B, v2, v6);
10119     __ eor(v3, __ T16B, v3, v7);
10120     __ orr(v0, __ T16B, v0, v1);
10121     __ orr(v1, __ T16B, v2, v3);
10122     __ orr(v0, __ T16B, v0, v1);
          // Move both 64-bit lanes of the combined difference into general
          // registers so that a single cbnz can test all of it.
10123     __ umov(tmp1, v0, __ D, 0);
10124     __ umov(tmp2, v0, __ D, 1);
10125     __ orr(tmp1, tmp1, tmp2);
10126     __ cbnz(tmp1, NOT_EQUAL);
10127     __ br(__ GE, LOOP);
10128   }
10129 
10130   // a1 = r1 - array1 address
10131   // a2 = r2 - array2 address
10132   // result = r0 - return value. Already contains "false"
10133   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
10134   // r3-r5 are reserved temporary registers
10135   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
        //
        // Emits the large_array_equals stub and returns its entry address. The
        // stub runs one of the two main loops above (SIMD or non-SIMD, with or
        // without prefetch), then a word-at-a-time tail, and sets result to true
        // only when no difference was found. In the non-SIMD variant r11-r13 are
        // pushed on entry to the main loop and popped again before returning.
10136   address generate_large_array_equals() {
10137     StubId stub_id = StubId::stubgen_large_array_equals_id;
10138     int entry_count = StubInfo::entry_count(stub_id);
10139     assert(entry_count == 1, "sanity check");
          // A non-null result from load_archive_data() is returned as the stub
          // entry and nothing is generated.
10140     address start = load_archive_data(stub_id);
10141     if (start != nullptr) {
10142       return start;
10143     }
10144     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10145         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
10146         tmp7 = r12, tmp8 = r13;
10147     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
10148         SMALL_LOOP, POST_LOOP;
          // The non-SIMD loop loads 16 bytes ahead of the data it compares, which
          // is accounted for in the non-prefetch threshold.
10149     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
10150     // calculate if at least 32 prefetched bytes are used
10151     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
10152     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
10153     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
10154     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
10155         tmp5, tmp6, tmp7, tmp8);
10156 
10157     __ align(CodeEntryAlignment);
10158 
10159     StubCodeMark mark(this, stub_id);
10160 
10161     address entry = __ pc();
10162     __ enter();
10163     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
10164     // also advance pointers to use post-increment instead of pre-increment
10165     __ add(a1, a1, wordSize);
10166     __ add(a2, a2, wordSize);
10167     if (AvoidUnalignedAccesses) {
10168       // both implementations (SIMD/nonSIMD) are using relatively large load
10169       // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
10170       // on some CPUs in case of address is not at least 16-byte aligned.
10171       // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
10172       // load if needed at least for 1st address and make it 16-byte aligned.
10173       Label ALIGNED16;
10174       __ tbz(a1, 3, ALIGNED16);
10175       __ ldr(tmp1, Address(__ post(a1, wordSize)));
10176       __ ldr(tmp2, Address(__ post(a2, wordSize)));
10177       __ sub(cnt1, cnt1, wordSize);
10178       __ eor(tmp1, tmp1, tmp2);
            // Nothing has been pushed yet, so a mismatch here must skip the pop.
10179       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
10180       __ bind(ALIGNED16);
10181     }
10182     if (UseSIMDForArrayEquals) {
10183       if (SoftwarePrefetchHintDistance >= 0) {
10184         __ subs(tmp1, cnt1, prefetchLoopThreshold);
10185         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
10186         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
10187             /* prfm = */ true, NOT_EQUAL);
10188         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
10189         __ br(__ LT, TAIL);
10190       }
10191       __ bind(NO_PREFETCH_LARGE_LOOP);
10192       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
10193           /* prfm = */ false, NOT_EQUAL);
10194     } else {
10195       __ push(spilled_regs, sp);
10196       if (SoftwarePrefetchHintDistance >= 0) {
10197         __ subs(tmp1, cnt1, prefetchLoopThreshold);
10198         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
10199         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
10200             /* prfm = */ true, NOT_EQUAL);
10201         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
10202         __ br(__ LT, TAIL);
10203       }
10204       __ bind(NO_PREFETCH_LARGE_LOOP);
10205       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
10206           /* prfm = */ false, NOT_EQUAL);
10207     }
10208     __ bind(TAIL);
10209       __ cbz(cnt1, EQUAL);
10210       __ subs(cnt1, cnt1, wordSize);
10211       __ br(__ LE, POST_LOOP);
10212     __ bind(SMALL_LOOP);
10213       __ ldr(tmp1, Address(__ post(a1, wordSize)));
10214       __ ldr(tmp2, Address(__ post(a2, wordSize)));
10215       __ subs(cnt1, cnt1, wordSize);
10216       __ eor(tmp1, tmp1, tmp2);
10217       __ cbnz(tmp1, NOT_EQUAL);
10218       __ br(__ GT, SMALL_LOOP);
10219     __ bind(POST_LOOP);
            // cnt1 is zero or negative here, so this final word load is positioned
            // to end exactly at the end of the data and may re-read bytes that
            // were already compared.
10220       __ ldr(tmp1, Address(a1, cnt1));
10221       __ ldr(tmp2, Address(a2, cnt1));
10222       __ eor(tmp1, tmp1, tmp2);
10223       __ cbnz(tmp1, NOT_EQUAL);
10224     __ bind(EQUAL);
10225       __ mov(result, true);
          // NOT_EQUAL is reached with result still holding the caller's "false".
10226     __ bind(NOT_EQUAL);
10227       if (!UseSIMDForArrayEquals) {
10228         __ pop(spilled_regs, sp);
10229       }
10230     __ bind(NOT_EQUAL_NO_POP);
10231     __ leave();
10232     __ ret(lr);
10233 
10234     // record the stub entry and end
10235     store_archive_data(stub_id, entry, __ pc());
10236 
10237     return entry;
10238   }
10239 
10240   // result = r0 - return value. Contains initial hashcode value on entry.
10241   // ary = r1 - array address
10242   // cnt = r2 - elements count
10243   // Clobbers: v0-v13, rscratch1, rscratch2
        //
        // Emits the large_arrays_hashcode stub for the given element type and
        // returns its entry address. The stub folds the elements into result
        // with multiplier 31: a vectorized SMALL_LOOP handles vf elements per
        // iteration, a 4x unrolled LARGE_LOOP handles evf elements per
        // iteration, and a scalar load + maddw tail handles the last cnt % vf
        // elements. Only T_BOOLEAN, T_BYTE, T_CHAR, T_SHORT and T_INT are accepted.
10244   address generate_large_arrays_hashcode(BasicType eltype) {
10245     StubId stub_id;
10246     switch (eltype) {
10247     case T_BOOLEAN:
10248       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
10249       break;
10250     case T_BYTE:
10251       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
10252       break;
10253     case T_CHAR:
10254       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
10255       break;
10256     case T_SHORT:
10257       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
10258       break;
10259     case T_INT:
10260       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
10261       break;
10262     default:
10263       stub_id = StubId::NO_STUBID;
10264       ShouldNotReachHere();
10265     };
10266     int entry_count = StubInfo::entry_count(stub_id);
10267     assert(entry_count == 1, "sanity check");
          // A non-null result from load_archive_data() is returned as the stub
          // entry and nothing is generated.
10268     address start = load_archive_data(stub_id);
10269     if (start != nullptr) {
10270       return start;
10271     }
10272     const Register result = r0, ary = r1, cnt = r2;
          // vdataN receives the loaded elements and vmulN is the matching 4-lane
          // accumulator. Note the reversed numbering of vdata0..3 (v3..v0): the
          // 4-register ld1 in LARGE_LOOP names vdata3 first.
10273     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
10274     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
10275     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
10276     const FloatRegister vpowm = v13;
10277 
          // NOTE(review): macro defined outside this excerpt; presumably declares
          // further register aliases - confirm against its definition.
10278     ARRAYS_HASHCODE_REGISTERS;
10279 
10280     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
10281 
10282     unsigned int vf; // vectorization factor
          // True when a loaded vector is accumulated in two halves of four
          // elements each (subword element types).
10283     bool multiply_by_halves;
10284     Assembler::SIMD_Arrangement load_arrangement;
10285     switch (eltype) {
10286     case T_BOOLEAN:
10287     case T_BYTE:
10288       load_arrangement = Assembler::T8B;
10289       multiply_by_halves = true;
10290       vf = 8;
10291       break;
10292     case T_CHAR:
10293     case T_SHORT:
10294       load_arrangement = Assembler::T8H;
10295       multiply_by_halves = true;
10296       vf = 8;
10297       break;
10298     case T_INT:
10299       load_arrangement = Assembler::T4S;
10300       multiply_by_halves = false;
10301       vf = 4;
10302       break;
10303     default:
10304       ShouldNotReachHere();
10305     }
10306 
10307     // Unroll factor
10308     const unsigned uf = 4;
10309 
10310     // Effective vectorization factor
10311     const unsigned evf = vf * uf;
10312 
10313     __ align(CodeEntryAlignment);
10314 
10315     StubCodeMark mark(this, stub_id);
10316 
10317     address entry = __ pc();
10318     __ enter();
10319 
10320     // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
10321     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
10322     // value shouldn't change throughout both loops.
10323     __ movw(rscratch1, intpow(31U, 3));
10324     __ mov(vpow, Assembler::S, 0, rscratch1);
10325     __ movw(rscratch1, intpow(31U, 2));
10326     __ mov(vpow, Assembler::S, 1, rscratch1);
10327     __ movw(rscratch1, intpow(31U, 1));
10328     __ mov(vpow, Assembler::S, 2, rscratch1);
10329     __ movw(rscratch1, intpow(31U, 0));
10330     __ mov(vpow, Assembler::S, 3, rscratch1);
10331 
          // Seed accumulator 0: all lanes zero except lane 3, which takes the
          // incoming hash value (lane 3 is the one paired with 31^0 in vpow).
10332     __ mov(vmul0, Assembler::T16B, 0);
10333     __ mov(vmul0, Assembler::S, 3, result);
10334 
          // rscratch2 = number of elements for SMALL_LOOP: the part of cnt that is
          // a multiple of vf but does not fill a whole evf-sized block. When it is
          // zero, go straight to the large loop.
10335     __ andr(rscratch2, cnt, (uf - 1) * vf);
10336     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
10337 
10338     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
10339     __ mov(vpowm, Assembler::S, 0, rscratch1);
10340 
10341     // SMALL LOOP
10342     __ bind(SMALL_LOOP);
10343 
10344     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
10345     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
          // Flags set here are consumed by the br(HI) that closes the loop.
10346     __ subsw(rscratch2, rscratch2, vf);
10347 
10348     if (load_arrangement == Assembler::T8B) {
10349       // Extend 8B to 8H to be able to use vector multiply
10350       // instructions
10351       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
10352       if (is_signed_subword_type(eltype)) {
10353         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10354       } else {
10355         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10356       }
10357     }
10358 
10359     switch (load_arrangement) {
10360     case Assembler::T4S:
10361       __ addv(vmul0, load_arrangement, vmul0, vdata0);
10362       break;
10363     case Assembler::T8B:
10364     case Assembler::T8H:
10365       assert(is_subword_type(eltype), "subword type expected");
10366       if (is_signed_subword_type(eltype)) {
10367         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10368       } else {
10369         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10370       }
10371       break;
10372     default:
10373       __ should_not_reach_here();
10374     }
10375 
10376     // Process the upper half of a vector
10377     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
10378       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
10379       if (is_signed_subword_type(eltype)) {
10380         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10381       } else {
10382         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10383       }
10384     }
10385 
10386     __ br(Assembler::HI, SMALL_LOOP);
10387 
10388     // SMALL LOOP'S EPILOGUE
          // If cnt holds at least one full evf-sized block, continue with the
          // large loop (vmul0 is carried over); otherwise reduce vmul0 to a scalar.
10389     __ lsr(rscratch2, cnt, exact_log2(evf));
10390     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
10391 
10392     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
10393     __ addv(vmul0, Assembler::T4S, vmul0);
10394     __ umov(result, vmul0, Assembler::S, 0);
10395 
10396     // TAIL
10397     __ bind(TAIL);
10398 
10399     // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
10400     // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
10401     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
10402     __ andr(rscratch2, cnt, vf - 1);
          // TAIL_SHORTCUT is entered from the large loop's epilogue with
          // rscratch2 already holding cnt % vf.
10403     __ bind(TAIL_SHORTCUT);
10404     __ adr(rscratch1, BR_BASE);
10405     // For Cortex-A53 offset is 4 because 2 nops are generated.
10406     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
          // rscratch2 is reused as the multiplier 31 for the maddw sequence.
10407     __ movw(rscratch2, 0x1f);
10408     __ br(rscratch1);
10409 
          // vf - 1 identical load + maddw groups; the computed branch above
          // enters this sequence so that exactly cnt % vf of them execute.
10410     for (size_t i = 0; i < vf - 1; ++i) {
10411       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
10412                                    eltype);
10413       __ maddw(result, result, rscratch2, rscratch1);
10414       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
10415       // Generate 2nd nop to have 4 instructions per iteration.
10416       if (VM_Version::supports_a53mac()) {
10417         __ nop();
10418       }
10419     }
10420     __ bind(BR_BASE);
10421 
10422     __ leave();
10423     __ ret(lr);
10424 
10425     // LARGE LOOP
10426     __ bind(LARGE_LOOP_PREHEADER);
10427 
          // rscratch2 = number of evf-sized blocks to process.
10428     __ lsr(rscratch2, cnt, exact_log2(evf));
10429 
10430     if (multiply_by_halves) {
10431       // 31^4 - multiplier between lower and upper parts of a register
10432       __ movw(rscratch1, intpow(31U, vf / 2));
10433       __ mov(vpowm, Assembler::S, 1, rscratch1);
10434       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
10435       __ movw(rscratch1, intpow(31U, evf - vf / 2));
10436       __ mov(vpowm, Assembler::S, 0, rscratch1);
10437     } else {
10438       // 31^16
10439       __ movw(rscratch1, intpow(31U, evf));
10440       __ mov(vpowm, Assembler::S, 0, rscratch1);
10441     }
10442 
          // vmul0 keeps whatever it accumulated so far; the other three
          // accumulators start from zero.
10443     __ mov(vmul3, Assembler::T16B, 0);
10444     __ mov(vmul2, Assembler::T16B, 0);
10445     __ mov(vmul1, Assembler::T16B, 0);
10446 
10447     __ bind(LARGE_LOOP);
10448 
10449     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
10450     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
10451     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
10452     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
10453 
10454     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
10455            Address(__ post(ary, evf * type2aelembytes(eltype))));
10456 
10457     if (load_arrangement == Assembler::T8B) {
10458       // Extend 8B to 8H to be able to use vector multiply
10459       // instructions
10460       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
10461       if (is_signed_subword_type(eltype)) {
10462         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
10463         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
10464         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
10465         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10466       } else {
10467         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
10468         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
10469         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
10470         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10471       }
10472     }
10473 
10474     switch (load_arrangement) {
10475     case Assembler::T4S:
10476       __ addv(vmul3, load_arrangement, vmul3, vdata3);
10477       __ addv(vmul2, load_arrangement, vmul2, vdata2);
10478       __ addv(vmul1, load_arrangement, vmul1, vdata1);
10479       __ addv(vmul0, load_arrangement, vmul0, vdata0);
10480       break;
10481     case Assembler::T8B:
10482     case Assembler::T8H:
10483       assert(is_subword_type(eltype), "subword type expected");
10484       if (is_signed_subword_type(eltype)) {
10485         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
10486         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
10487         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
10488         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10489       } else {
10490         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
10491         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
10492         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
10493         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10494       }
10495       break;
10496     default:
10497       __ should_not_reach_here();
10498     }
10499 
10500     // Process the upper half of a vector
10501     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
10502       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
10503       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
10504       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
10505       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
10506       if (is_signed_subword_type(eltype)) {
10507         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
10508         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
10509         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
10510         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10511       } else {
10512         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
10513         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
10514         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
10515         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10516       }
10517     }
10518 
10519     __ subsw(rscratch2, rscratch2, 1);
10520     __ br(Assembler::HI, LARGE_LOOP);
10521 
          // Epilogue: reduce each accumulator to a scalar (multiply by vpow, then
          // sum across lanes) and chain the four scalars together, multiplying the
          // running result by 31^vf before each of vmul2, vmul1 and vmul0 is added.
10522     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
10523     __ addv(vmul3, Assembler::T4S, vmul3);
10524     __ umov(result, vmul3, Assembler::S, 0);
10525 
10526     __ mov(rscratch2, intpow(31U, vf));
10527 
10528     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
10529     __ addv(vmul2, Assembler::T4S, vmul2);
10530     __ umov(rscratch1, vmul2, Assembler::S, 0);
10531     __ maddw(result, result, rscratch2, rscratch1);
10532 
10533     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
10534     __ addv(vmul1, Assembler::T4S, vmul1);
10535     __ umov(rscratch1, vmul1, Assembler::S, 0);
10536     __ maddw(result, result, rscratch2, rscratch1);
10537 
10538     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
10539     __ addv(vmul0, Assembler::T4S, vmul0);
10540     __ umov(rscratch1, vmul0, Assembler::S, 0);
10541     __ maddw(result, result, rscratch2, rscratch1);
10542 
          // Any leftover cnt % vf elements are handled by the scalar tail.
10543     __ andr(rscratch2, cnt, vf - 1);
10544     __ cbnz(rscratch2, TAIL_SHORTCUT);
10545 
10546     __ leave();
10547     __ ret(lr);
10548 
10549     // record the stub entry and end
10550     store_archive_data(stub_id, entry, __ pc());
10551 
10552     return entry;
10553   }
10554 
10555   address generate_dsin_dcos(bool isCos) {
          // Emits the dcos stub (isCos == true) or the dsin stub (isCos == false)
          // and returns its entry address. The instruction sequence itself comes
          // from MacroAssembler::generate_dsin_dcos(), which is handed the
          // addresses of the StubRoutines::aarch64 constant tables listed below.
10556     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
10557     int entry_count = StubInfo::entry_count(stub_id);
10558     assert(entry_count == 1, "sanity check");
          // A non-null result from load_archive_data() is returned as the stub
          // entry and nothing is generated.
10559     address start = load_archive_data(stub_id);
10560     if (start != nullptr) {
10561       return start;
10562     }
10563     __ align(CodeEntryAlignment);
10564     StubCodeMark mark(this, stub_id);
          // 'start' is reused: from here on it is the entry of the new code.
10565     start = __ pc();
10566     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
10567         (address)StubRoutines::aarch64::_two_over_pi,
10568         (address)StubRoutines::aarch64::_pio2,
10569         (address)StubRoutines::aarch64::_dsin_coef,
10570         (address)StubRoutines::aarch64::_dcos_coef);
10571 
10572     // record the stub entry and end
10573     store_archive_data(stub_id, start, __ pc());
10574 
10575     return start;
10576   }
10577 
10578   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
        //
        // tmpL  - scratch register that receives 4 widened Latin1 characters
        // tmpU  - scratch register that receives 8 bytes loaded through cnt1
        // DIFF1 - target when the mismatching pair is (tmpU, tmpL)
        // DIFF2 - target when the mismatching pair is (tmp3, tmpL)
        //
        // Fixed registers: tmp2 (r11) is read with a 16-byte post-increment and
        // cnt1 (r2) with four 8-byte post-increments; the visible caller sets
        // them up as the Latin1 and UTF-16 load pointers respectively. tmp3 (r12)
        // must hold the 8 bytes loaded ahead through cnt1 on entry and holds the
        // next such 8 bytes on exit. vtmpZ (v0) is interleaved with the Latin1
        // bytes to widen them; the visible caller zeroes it beforehand.
10579   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
10580       Label &DIFF2) {
10581     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
10582     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
10583 
10584     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
10585     __ ldr(tmpU, Address(__ post(cnt1, 8)));
          // Interleave the low 8 bytes of vtmp with bytes from vtmpZ.
10586     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
10587     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
10588 
          // Characters 0..3: low half of vtmp3 against the word loaded ahead.
10589     __ fmovd(tmpL, vtmp3);
10590     __ eor(rscratch2, tmp3, tmpL);
10591     __ cbnz(rscratch2, DIFF2);
10592 
          // Characters 4..7: high half of vtmp3 against tmpU.
10593     __ ldr(tmp3, Address(__ post(cnt1, 8)));
10594     __ umov(tmpL, vtmp3, __ D, 1);
10595     __ eor(rscratch2, tmpU, tmpL);
10596     __ cbnz(rscratch2, DIFF1);
10597 
          // Characters 8..11: interleave the high 8 bytes of vtmp in place.
10598     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
10599     __ ldr(tmpU, Address(__ post(cnt1, 8)));
10600     __ fmovd(tmpL, vtmp);
10601     __ eor(rscratch2, tmp3, tmpL);
10602     __ cbnz(rscratch2, DIFF2);
10603 
          // Characters 12..15; the ldr also loads ahead for the next invocation.
10604     __ ldr(tmp3, Address(__ post(cnt1, 8)));
10605     __ umov(tmpL, vtmp, __ D, 1);
10606     __ eor(rscratch2, tmpU, tmpL);
10607     __ cbnz(rscratch2, DIFF1);
10608   }
10609 
10610   // r0  = result
10611   // r1  = str1
10612   // r2  = cnt1
10613   // r3  = str2
10614   // r4  = cnt2
10615   // r10 = tmp1
10616   // r11 = tmp2
        //
        // Generates the long-string compare stub for operands of different
        // encodings (one Latin1, one UTF-16). isLU selects the LU stub id,
        // otherwise the UL stub id is used. The first 4 characters are expected
        // to be loaded already (vtmp and tmp2 for LU / tmp1 for UL).
        // On a mismatch, result receives the difference of the first differing
        // characters; on the equal path (DONE) the stub returns without writing
        // result.
        // Returns the stub entry, or the address handed back by
        // load_archive_data() when that is non-null.
10617   address generate_compare_long_string_different_encoding(bool isLU) {
10618     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
10619     int entry_count = StubInfo::entry_count(stub_id);
10620     assert(entry_count == 1, "sanity check");
          // If load_archive_data() already provides an address, return it
          // without emitting any code.
10621     address start = load_archive_data(stub_id);
10622     if (start != nullptr) {
10623       return start;
10624     }
10625     __ align(CodeEntryAlignment);
10626     StubCodeMark mark(this, stub_id);
10627     address entry = __ pc();
          // NOTE(review): TAIL_LOAD_16 is declared here but never bound or
          // branched to anywhere in this function.
10628     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
10629         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
10630         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
10631     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
10632         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
10633     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
          // tmp3/tmp4 are pushed before the loops and popped again on both ways
          // out of them (DIFF1 and LOAD_LAST).
10634     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
10635
          // The prefetching loop is entered, and repeated, only while cnt2 is at
          // least this value.
10636     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
10637
          // vtmpZ = 0; zipping Latin1 bytes with it places a zero byte after each
          // character byte.
10638     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
10639     // cnt2 == amount of characters left to compare
10640     // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
10641     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
          // Step both pointers past the 4 characters that were already loaded:
          // wordSize/2 bytes in the Latin1 string, wordSize bytes in the UTF-16 one.
10642     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
10643     __ add(str2, str2, isLU ? wordSize : wordSize/2);
10644     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
10645     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
10646     __ eor(rscratch2, tmp1, tmp2);
          // CALCULATE_DIFFERENCE works on the pair tmp1 / rscratch1.
10647     __ mov(rscratch1, tmp2);
10648     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
10649     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
10650              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
10651     __ push(spilled_regs, sp);
10652     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
10653     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
10654
          // Pre-load the next 8 bytes of the U string into tmp3.
10655     __ ldr(tmp3, Address(__ post(cnt1, 8)));
10656
10657     if (SoftwarePrefetchHintDistance >= 0) {
10658       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
10659       __ br(__ LT, NO_PREFETCH);
10660       __ bind(LARGE_LOOP_PREFETCH);
10661         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
10662         __ mov(tmp4, 2);
10663         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
10664         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
10665           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10666           __ subs(tmp4, tmp4, 1);
10667           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
10668           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
10669           __ mov(tmp4, 2);
10670         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
10671           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10672           __ subs(tmp4, tmp4, 1);
10673           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
                // REPEAT1 and REPEAT2 each ran compare_string_16_x_LU twice; the
                // four runs are accounted for as 64 characters.
10674           __ sub(cnt2, cnt2, 64);
10675           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
10676           __ br(__ GE, LARGE_LOOP_PREFETCH);
10677     }
10678     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
10679     __ bind(NO_PREFETCH);
10680     __ subs(cnt2, cnt2, 16);
10681     __ br(__ LT, TAIL);
10682     __ align(OptoLoopAlignment);
10683     __ bind(SMALL_LOOP); // smaller loop
10684       __ subs(cnt2, cnt2, 16);
10685       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10686       __ br(__ GE, SMALL_LOOP);
            // cnt2 == -16: nothing is left before the last load.
10687       __ cmn(cnt2, (u1)16);
10688       __ br(__ EQ, LOAD_LAST);
10689     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
10690       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
10691       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
10692       __ ldr(tmp3, Address(cnt1, -8));
10693       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
10694       __ b(LOAD_LAST);
          // DIFF2: the mismatching U value is in tmp3, so move it into tmpU first.
10695     __ bind(DIFF2);
10696       __ mov(tmpU, tmp3);
          // DIFF1: the mismatching values are already in tmpU / tmpL.
10697     __ bind(DIFF1);
10698       __ pop(spilled_regs, sp);
10699       __ b(CALCULATE_DIFFERENCE);
10700     __ bind(LOAD_LAST);
10701       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
10702       // No need to load it again
10703       __ mov(tmpU, tmp3);
10704       __ pop(spilled_regs, sp);
10705
10706       // tmp2 points to the address of the last 4 Latin1 characters right now
10707       __ ldrs(vtmp, Address(tmp2));
10708       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
10709       __ fmovd(tmpL, vtmp);
10710
10711       __ eor(rscratch2, tmpU, tmpL);
10712       __ cbz(rscratch2, DONE);
10713
10714     // Find the first different characters in the longwords and
10715     // compute their difference.
          // On entry rscratch2 holds the XOR of the two longwords. Byte-reversing
          // it and counting leading zeros locates the lowest differing byte;
          // clearing the low 4 bits rounds that down to the start of its 16-bit
          // character.
10716     __ bind(CALCULATE_DIFFERENCE);
10717       __ rev(rscratch2, rscratch2);
10718       __ clz(rscratch2, rscratch2);
10719       __ andr(rscratch2, rscratch2, -16);
10720       __ lsrv(tmp1, tmp1, rscratch2);
10721       __ uxthw(tmp1, tmp1);
10722       __ lsrv(rscratch1, rscratch1, rscratch2);
10723       __ uxthw(rscratch1, rscratch1);
10724       __ subw(result, tmp1, rscratch1);
10725     __ bind(DONE);
10726       __ ret(lr);
10727
10728       // record the stub entry and end
10729       store_archive_data(stub_id, entry, __ pc());
10730
10731       return entry;
10732   }
10733 
10734   // Stub converting float16 to float.
10735   //   input:  r0 (float16)      result: v0 (float)
10736   //   v1 is used as a temporary float register
10737   address generate_float16ToFloat() {
10738     StubId id = StubId::stubgen_hf2f_id;
10739     const int num_entries = StubInfo::entry_count(id);
10740     assert(num_entries == 1, "sanity check");
          // Return early, emitting nothing, when load_archive_data() yields an address.
10741     address archived = load_archive_data(id);
10742     if (archived != nullptr) {
10743       return archived;
10744     }
10745     __ align(CodeEntryAlignment);
10746     StubCodeMark mark(this, id);
10747     address stub_entry = __ pc();
10748     BLOCK_COMMENT("Entry:");
10749     __ flt16_to_flt(v0, r0, v1);
10750     __ ret(lr);
10751
10752     // hand the [entry, end) range of the emitted stub to store_archive_data()
10753     store_archive_data(id, stub_entry, __ pc());
10754
10755     return stub_entry;
10756   }
10757 
10758   // Stub converting float to float16.
10759   //   input:  v0 (float)        result: r0 (float16)
10760   //   v1 is used as a temporary float register
10761   address generate_floatToFloat16() {
10762     StubId id = StubId::stubgen_f2hf_id;
10763     const int num_entries = StubInfo::entry_count(id);
10764     assert(num_entries == 1, "sanity check");
          // Return early, emitting nothing, when load_archive_data() yields an address.
10765     address archived = load_archive_data(id);
10766     if (archived != nullptr) {
10767       return archived;
10768     }
10769     __ align(CodeEntryAlignment);
10770     StubCodeMark mark(this, id);
10771     address stub_entry = __ pc();
10772     BLOCK_COMMENT("Entry:");
10773     __ flt_to_flt16(r0, v0, v1);
10774     __ ret(lr);
10775
10776     // hand the [entry, end) range of the emitted stub to store_archive_data()
10777     store_archive_data(id, stub_entry, __ pc());
10778
10779     return stub_entry;
10780   }
10781 
10782   address generate_method_entry_barrier() {
          // Generates the method entry barrier stub. The stub calls
          // BarrierSetNMethod::nmethod_stub_entry_barrier() with a pointer to the
          // saved return address. If that call returns zero the stub returns to
          // its caller; otherwise it loads a new {sp, fp, lr, pc} from four words
          // reserved on the stack and branches to the new pc.
          // Returns the stub start, or the address handed back by
          // load_archive_data() when that is non-null.
10783     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
10784     int entry_count = StubInfo::entry_count(stub_id);
10785     assert(entry_count == 1, "sanity check");
10786     address start = load_archive_data(stub_id);
10787     if (start != nullptr) {
10788       return start;
10789     }
10790     __ align(CodeEntryAlignment);
10791     StubCodeMark mark(this, stub_id);
10792
10793     Label deoptimize_label;
10794
10795     start = __ pc();
10796
10797     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
10798
10799     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
10800       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
10801       // We can get here despite the nmethod being good, if we have not
10802       // yet applied our cross modification fence (or data fence).
            // Copy the 32-bit value at patching_epoch_addr() into the thread slot
            // located 4 bytes past the disarmed guard value, then fence.
10803       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
10804       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
10805       __ ldrw(rscratch2, rscratch2);
10806       __ strw(rscratch2, thread_epoch_addr);
10807       __ isb();
10808       __ membar(__ LoadLoad);
10809     }
10810
10811     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
10812
10813     __ enter();
10814     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
10815
10816     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
10817
10818     __ push_call_clobbered_registers();
10819
          // Single argument: the address of the saved lr computed above.
10820     __ mov(c_rarg0, rscratch2);
10821     __ call_VM_leaf
10822          (CAST_FROM_FN_PTR
10823           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
10824
10825     __ reset_last_Java_frame(true);
10826
          // Keep the call's result in rscratch1 across the register restore below.
10827     __ mov(rscratch1, r0);
10828
10829     __ pop_call_clobbered_registers();
10830
          // Non-zero result: take the deoptimization path.
10831     __ cbnz(rscratch1, deoptimize_label);
10832
10833     __ leave();
10834     __ ret(lr);
10835
10836     __ BIND(deoptimize_label);
10837
          // sp points at the four reserved words again at this point.
          // NOTE(review): they are presumably filled in by the runtime call when
          // it requests deoptimization - not visible in this function; confirm.
10838     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
10839     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
10840
10841     __ mov(sp, rscratch1);
10842     __ br(rscratch2);
10843
10844     // record the stub entry and end
10845     store_archive_data(stub_id, start, __ pc());
10846
10847     return start;
10848   }
10849 
10850   // r0  = result
10851   // r1  = str1
10852   // r2  = cnt1
10853   // r3  = str2
10854   // r4  = cnt2
10855   // r10 = tmp1
10856   // r11 = tmp2
        //
        // Generates the long-string compare stub for two strings of the same
        // encoding: the LL stub id when isLL, otherwise the UU stub id. The first
        // 8 bytes of each string are expected to be pre-loaded into tmp1 / tmp2.
        // On a mismatch, result receives the difference of the first differing
        // characters; on the equal paths (LENGTH_DIFF) the stub returns without
        // writing result.
        // Returns the stub entry, or the address handed back by
        // load_archive_data() when that is non-null.
10857   address generate_compare_long_string_same_encoding(bool isLL) {
10858     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
10859     int entry_count = StubInfo::entry_count(stub_id);
10860     assert(entry_count == 1, "sanity check");
10861     address start = load_archive_data(stub_id);
10862     if (start != nullptr) {
10863       return start;
10864     }
10865     __ align(CodeEntryAlignment);
10866     StubCodeMark mark(this, stub_id);
10867     address entry = __ pc();
          // tmp1h / tmp2h hold the second 8 bytes of each 16-byte ldp pair.
10868     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
10869         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
10870
10871     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
10872
10873     // exit from large loop when less than 64 bytes left to read or we're about
10874     // to prefetch memory behind array border
          // (cnt2 is counted in characters, hence the division by 2 for UTF-16)
10875     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
10876
10877     // before jumping to stub, pre-load 8 bytes already, so do comparison directly
10878     __ eor(rscratch2, tmp1, tmp2);
10879     __ cbnz(rscratch2, CAL_DIFFERENCE);
10880
          // Account for the 8 pre-loaded bytes in characters.
10881     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
10882     // update pointers, because of previous read
10883     __ add(str1, str1, wordSize);
10884     __ add(str2, str2, wordSize);
10885     if (SoftwarePrefetchHintDistance >= 0) {
10886       __ align(OptoLoopAlignment);
10887       __ bind(LARGE_LOOP_PREFETCH);
10888         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
10889         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
10890
              // 4 x 16 bytes per string. cmp + ccmp leave the flags at NE when
              // either 8-byte half of the pair differs.
10891         for (int i = 0; i < 4; i++) {
10892           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
10893           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
10894           __ cmp(tmp1, tmp2);
10895           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10896           __ br(Assembler::NE, DIFF);
10897         }
10898         __ sub(cnt2, cnt2, isLL ? 64 : 32);
10899         __ add(str1, str1, 64);
10900         __ add(str2, str2, 64);
10901         __ subs(rscratch2, cnt2, largeLoopExitCondition);
10902         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
10903         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
10904     }
10905
10906     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
10907     __ br(Assembler::LE, LESS16);
10908     __ align(OptoLoopAlignment);
          // Compares 16 bytes per step; the loop body contains two such steps.
10909     __ bind(LOOP_COMPARE16);
10910       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
10911       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
10912       __ cmp(tmp1, tmp2);
10913       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10914       __ br(Assembler::NE, DIFF);
10915       __ sub(cnt2, cnt2, isLL ? 16 : 8);
10916       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
10917       __ br(Assembler::LT, LESS16);
10918
10919       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
10920       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
10921       __ cmp(tmp1, tmp2);
10922       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10923       __ br(Assembler::NE, DIFF);
10924       __ sub(cnt2, cnt2, isLL ? 16 : 8);
10925       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
10926       __ br(Assembler::GE, LOOP_COMPARE16);
10927       __ cbz(cnt2, LENGTH_DIFF);
10928
10929     __ bind(LESS16);
10930       // each 8 compare
10931       __ subs(cnt2, cnt2, isLL ? 8 : 4);
10932       __ br(Assembler::LE, LESS8);
10933       __ ldr(tmp1, Address(__ post(str1, 8)));
10934       __ ldr(tmp2, Address(__ post(str2, 8)));
10935       __ eor(rscratch2, tmp1, tmp2);
10936       __ cbnz(rscratch2, CAL_DIFFERENCE);
10937       __ sub(cnt2, cnt2, isLL ? 8 : 4);
10938
10939     __ bind(LESS8); // directly load last 8 bytes
            // cnt2 is used as a byte offset from str1 / str2 below, so the UTF-16
            // character count is doubled first.
10940       if (!isLL) {
10941         __ add(cnt2, cnt2, cnt2);
10942       }
10943       __ ldr(tmp1, Address(str1, cnt2));
10944       __ ldr(tmp2, Address(str2, cnt2));
10945       __ eor(rscratch2, tmp1, tmp2);
10946       __ cbz(rscratch2, LENGTH_DIFF);
10947       __ b(CAL_DIFFERENCE);
10948
          // A 16-byte pair differs: continue with the low words if they differ,
          // otherwise with the high words.
10949     __ bind(DIFF);
10950       __ cmp(tmp1, tmp2);
10951       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
10952       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
10953       // reuse rscratch2 register for the result of eor instruction
10954       __ eor(rscratch2, tmp1, tmp2);
10955
          // On entry rscratch2 holds tmp1 ^ tmp2. Byte-reversing it and counting
          // leading zeros locates the lowest differing byte; the mask rounds that
          // down to the start of the character (8 or 16 bits wide).
10956     __ bind(CAL_DIFFERENCE);
10957       __ rev(rscratch2, rscratch2);
10958       __ clz(rscratch2, rscratch2);
10959       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
10960       __ lsrv(tmp1, tmp1, rscratch2);
10961       __ lsrv(tmp2, tmp2, rscratch2);
10962       if (isLL) {
10963         __ uxtbw(tmp1, tmp1);
10964         __ uxtbw(tmp2, tmp2);
10965       } else {
10966         __ uxthw(tmp1, tmp1);
10967         __ uxthw(tmp2, tmp2);
10968       }
10969       __ subw(result, tmp1, tmp2);
10970
10971     __ bind(LENGTH_DIFF);
10972       __ ret(lr);
10973
10974     // record the stub entry and end
10975     store_archive_data(stub_id, entry, __ pc());
10976
10977     return entry;
10978   }
10979 
10980   enum string_compare_mode {
          // Encoding pair handled by a long-string compare stub. The first letter
          // refers to str1 and the second to str2. generate_compare_long_string_sve
          // loads "L" operands as 1-byte elements and "U" operands as 2-byte
          // elements.
10981     LL,
10982     LU,
10983     UL,
10984     UU,
10985   };
10986 
10987   // The following registers are declared in aarch64.ad
10988   // r0  = result
10989   // r1  = str1
10990   // r2  = cnt1
10991   // r3  = str2
10992   // r4  = cnt2
10993   // r10 = tmp1
10994   // r11 = tmp2
10995   // z0  = ztmp1
10996   // z1  = ztmp2
10997   // p0  = pgtmp1
10998   // p1  = pgtmp2
        //
        // Generates the SVE variant of the long-string compare stub for the given
        // encoding pair. The strings are compared one vector at a time under a
        // governing predicate. On a mismatch, result receives the difference of
        // the first differing elements; on the equal path (DONE) the stub returns
        // without writing result.
        // Returns the stub entry, or the address handed back by
        // load_archive_data() when that is non-null.
10999   address generate_compare_long_string_sve(string_compare_mode mode) {
11000     StubId stub_id;
11001     switch (mode) {
11002       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
11003       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
11004       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
11005       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
11006       default: ShouldNotReachHere();
11007     }
11008     int entry_count = StubInfo::entry_count(stub_id);
11009     assert(entry_count == 1, "sanity check");
11010     address start = load_archive_data(stub_id);
11011     if (start != nullptr) {
11012       return start;
11013     }
11014     __ align(CodeEntryAlignment);
11015     StubCodeMark mark(this, stub_id);
11016     address entry = __ pc();
11017     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
11018              tmp1 = r10, tmp2 = r11;
11019
11020     Label LOOP, DONE, MISMATCH;
11021     Register vec_len = tmp1;
11022     Register idx = tmp2;
11023     // The minimum of the string lengths has been stored in cnt2.
11024     Register cnt = cnt2;
11025     FloatRegister ztmp1 = z0, ztmp2 = z1;
11026     PRegister pgtmp1 = p0, pgtmp2 = p1;
11027
          // Loads one predicated vector from each string at element index idx.
          // src1 / src2 are the base address registers. 1-byte elements are loaded
          // with ld1b (into 16-bit lanes when the other string has 2-byte
          // elements), 2-byte elements with ld1h and an index scaled by 2.
          // The macro body uses its own src1 / src2 parameters, so the registers
          // named at the call site are the ones that get loaded from.
11028 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
11029     switch (mode) {                                                            \
11030       case LL:                                                                 \
11031         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(src1, idx));                  \
11032         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(src2, idx));                  \
11033         break;                                                                 \
11034       case LU:                                                                 \
11035         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(src1, idx));                  \
11036         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(src2, idx, Address::lsl(1))); \
11037         break;                                                                 \
11038       case UL:                                                                 \
11039         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(src1, idx, Address::lsl(1))); \
11040         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(src2, idx));                  \
11041         break;                                                                 \
11042       case UU:                                                                 \
11043         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(src1, idx, Address::lsl(1))); \
11044         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(src2, idx, Address::lsl(1))); \
11045         break;                                                                 \
11046       default:                                                                 \
11047         ShouldNotReachHere();                                                  \
11048     }
11049
          // pgtmp1 = lanes whose element index (starting at idx = 0) is below cnt.
11050     __ mov(idx, 0);
11051     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
11052
          // vec_len = elements per vector: bytes for LL, halfwords otherwise.
11053     if (mode == LL) {
11054       __ sve_cntb(vec_len);
11055     } else {
11056       __ sve_cnth(vec_len);
11057     }
11058
          // Main loop bound: keep looping while idx < cnt - vec_len.
11059     __ sub(rscratch1, cnt, vec_len);
11060
11061     __ bind(LOOP);
11062
11063       // main loop
11064       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, str1, str2, idx);
11065       __ add(idx, idx, vec_len);
11066       // Compare strings.
11067       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
11068       __ br(__ NE, MISMATCH);
11069       __ cmp(idx, rscratch1);
11070       __ br(__ LT, LOOP);
11071
11072     // post loop, last iteration
          // Rebuild the predicate so that only the elements left below cnt are
          // loaded and compared.
11073     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
11074
11075     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, str1, str2, idx);
11076     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
11077     __ br(__ EQ, DONE);
11078
11079     __ bind(MISMATCH);
11080
11081     // Crop the vector to find its location.
          // (pgtmp2 becomes the lanes that precede the first mismatching lane)
11082     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
11083     // Extract the first different characters of each string.
          // (lasta reads the element right after the last active lane)
11084     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
11085     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
11086
11087     // Compute the difference of the first different characters.
11088     __ sub(result, rscratch1, rscratch2);
11089
11090     __ bind(DONE);
11091     __ ret(lr);
11092 #undef LOAD_PAIR
11093
11094     // record the stub entry and end
11095     store_archive_data(stub_id, entry, __ pc());
11096
11097     return entry;
11098   }
11099 
11100   void generate_compare_long_strings() {
          // Installs the four long-string compare stubs (LL, UU, LU, UL, in that
          // order). The SVE generator is used whenever UseSVE is non-zero;
          // otherwise the same-encoding / different-encoding generators are used.
11101     if (UseSVE != 0) {
11102       StubRoutines::aarch64::_compare_long_string_LL
11103           = generate_compare_long_string_sve(LL);
11104       StubRoutines::aarch64::_compare_long_string_UU
11105           = generate_compare_long_string_sve(UU);
11106       StubRoutines::aarch64::_compare_long_string_LU
11107           = generate_compare_long_string_sve(LU);
11108       StubRoutines::aarch64::_compare_long_string_UL
11109           = generate_compare_long_string_sve(UL);
11110     } else {
11111       StubRoutines::aarch64::_compare_long_string_LL
11112           = generate_compare_long_string_same_encoding(true);
11113       StubRoutines::aarch64::_compare_long_string_UU
11114           = generate_compare_long_string_same_encoding(false);
11115       StubRoutines::aarch64::_compare_long_string_LU
11116           = generate_compare_long_string_different_encoding(true);
11117       StubRoutines::aarch64::_compare_long_string_UL
11118           = generate_compare_long_string_different_encoding(false);
11119     }
11120   }
11121 
11122   // R0 = result
11123   // R1 = str2
11124   // R2 = cnt1
11125   // R3 = str1
11126   // R4 = cnt2
11127   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
11128   //
11129   // This generic linear code uses a few additional ideas, which make it faster:
11130   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
11131   // in order to skip the initial loading (helps on systems with 1 ld pipeline)
11132   // 2) we can use a "fast" algorithm for finding a single character to search for
11133   // the first symbol with fewer branches (1 branch per loaded register instead
11134   // of a branch for each symbol), so this is where constants like
11135   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
11136   // 3) after loading and analyzing the 1st register of the source string, it can be
11137   // used to search for every 1st character entry, saving a few loads in
11138   // comparison with a "simpler-but-slower" implementation
11139   // 4) in order to avoid lots of push/pop operations, the code below heavily
11140   // re-uses/re-initializes/compresses register values, which makes the code
11141   // larger and a bit less readable; however, most of the extra operations are
11142   // issued during loads or branches, so the penalty is minimal
11143   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
11144     StubId stub_id;
11145     if (str1_isL) {
11146       if (str2_isL) {
11147         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
11148       } else {
11149         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
11150       }
11151     } else {
11152       if (str2_isL) {
11153         ShouldNotReachHere();
11154       } else {
11155         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
11156       }
11157     }
11158     int entry_count = StubInfo::entry_count(stub_id);
11159     assert(entry_count == 1, "sanity check");
11160     address start = load_archive_data(stub_id);
11161     if (start != nullptr) {
11162       return start;
11163     }
11164     __ align(CodeEntryAlignment);
11165     StubCodeMark mark(this, stub_id);
11166     address entry = __ pc();
11167 
11168     int str1_chr_size = str1_isL ? 1 : 2;
11169     int str2_chr_size = str2_isL ? 1 : 2;
11170     int str1_chr_shift = str1_isL ? 0 : 1;
11171     int str2_chr_shift = str2_isL ? 0 : 1;
11172     bool isL = str1_isL && str2_isL;
11173    // parameters
11174     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
11175     // temporary registers
11176     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
11177     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
11178     // redefinitions
11179     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
11180 
11181     __ push(spilled_regs, sp);
11182     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
11183         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
11184         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
11185         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
11186         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
11187         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
11188     // Read whole register from str1. It is safe, because length >=8 here
11189     __ ldr(ch1, Address(str1));
11190     // Read whole register from str2. It is safe, because length >=8 here
11191     __ ldr(ch2, Address(str2));
11192     __ sub(cnt2, cnt2, cnt1);
11193     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
11194     if (str1_isL != str2_isL) {
11195       __ eor(v0, __ T16B, v0, v0);
11196     }
11197     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
11198     __ mul(first, first, tmp1);
11199     // check if we have less than 1 register to check
11200     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
11201     if (str1_isL != str2_isL) {
11202       __ fmovd(v1, ch1);
11203     }
11204     __ br(__ LE, L_SMALL);
11205     __ eor(ch2, first, ch2);
11206     if (str1_isL != str2_isL) {
11207       __ zip1(v1, __ T16B, v1, v0);
11208     }
11209     __ sub(tmp2, ch2, tmp1);
11210     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11211     __ bics(tmp2, tmp2, ch2);
11212     if (str1_isL != str2_isL) {
11213       __ fmovd(ch1, v1);
11214     }
11215     __ br(__ NE, L_HAS_ZERO);
11216     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
11217     __ add(result, result, wordSize/str2_chr_size);
11218     __ add(str2, str2, wordSize);
11219     __ br(__ LT, L_POST_LOOP);
11220     __ BIND(L_LOOP);
11221       __ ldr(ch2, Address(str2));
11222       __ eor(ch2, first, ch2);
11223       __ sub(tmp2, ch2, tmp1);
11224       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11225       __ bics(tmp2, tmp2, ch2);
11226       __ br(__ NE, L_HAS_ZERO);
11227     __ BIND(L_LOOP_PROCEED);
11228       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
11229       __ add(str2, str2, wordSize);
11230       __ add(result, result, wordSize/str2_chr_size);
11231       __ br(__ GE, L_LOOP);
11232     __ BIND(L_POST_LOOP);
11233       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
11234       __ br(__ LE, NOMATCH);
11235       __ ldr(ch2, Address(str2));
11236       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
11237       __ eor(ch2, first, ch2);
11238       __ sub(tmp2, ch2, tmp1);
11239       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11240       __ mov(tmp4, -1); // all bits set
11241       __ b(L_SMALL_PROCEED);
11242     __ align(OptoLoopAlignment);
11243     __ BIND(L_SMALL);
11244       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
11245       __ eor(ch2, first, ch2);
11246       if (str1_isL != str2_isL) {
11247         __ zip1(v1, __ T16B, v1, v0);
11248       }
11249       __ sub(tmp2, ch2, tmp1);
11250       __ mov(tmp4, -1); // all bits set
11251       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11252       if (str1_isL != str2_isL) {
11253         __ fmovd(ch1, v1); // move converted 4 symbols
11254       }
11255     __ BIND(L_SMALL_PROCEED);
11256       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
11257       __ bic(tmp2, tmp2, ch2);
11258       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
11259       __ rbit(tmp2, tmp2);
11260       __ br(__ EQ, NOMATCH);
11261     __ BIND(L_SMALL_HAS_ZERO_LOOP);
11262       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
11263       __ cmp(cnt1, u1(wordSize/str2_chr_size));
11264       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
11265       if (str2_isL) { // LL
11266         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
11267         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
11268         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
11269         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
11270         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11271       } else {
11272         __ mov(ch2, 0xE); // all bits in byte set except last one
11273         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11274         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11275         __ lslv(tmp2, tmp2, tmp4);
11276         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11277         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11278         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11279         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11280       }
11281       __ cmp(ch1, ch2);
11282       __ mov(tmp4, wordSize/str2_chr_size);
11283       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11284     __ BIND(L_SMALL_CMP_LOOP);
11285       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
11286                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
11287       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
11288                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
11289       __ add(tmp4, tmp4, 1);
11290       __ cmp(tmp4, cnt1);
11291       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
11292       __ cmp(first, ch2);
11293       __ br(__ EQ, L_SMALL_CMP_LOOP);
11294     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
11295       __ cbz(tmp2, NOMATCH); // no more matches. exit
11296       __ clz(tmp4, tmp2);
11297       __ add(result, result, 1); // advance index
11298       __ add(str2, str2, str2_chr_size); // advance pointer
11299       __ b(L_SMALL_HAS_ZERO_LOOP);
11300     __ align(OptoLoopAlignment);
11301     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
11302       __ cmp(first, ch2);
11303       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11304       __ b(DONE);
11305     __ align(OptoLoopAlignment);
11306     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
11307       if (str2_isL) { // LL
11308         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
11309         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
11310         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
11311         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
11312         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11313       } else {
11314         __ mov(ch2, 0xE); // all bits in byte set except last one
11315         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11316         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11317         __ lslv(tmp2, tmp2, tmp4);
11318         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11319         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11320         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11321         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11322       }
11323       __ cmp(ch1, ch2);
11324       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11325       __ b(DONE);
11326     __ align(OptoLoopAlignment);
11327     __ BIND(L_HAS_ZERO);
11328       __ rbit(tmp2, tmp2);
11329       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
11330       // Now, perform compression of counters(cnt2 and cnt1) into one register.
11331       // It's fine because both counters are 32bit and are not changed in this
11332       // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
11333       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
11334       __ sub(result, result, 1);
11335     __ BIND(L_HAS_ZERO_LOOP);
11336       __ mov(cnt1, wordSize/str2_chr_size);
11337       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
11338       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
11339       if (str2_isL) {
11340         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
11341         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11342         __ lslv(tmp2, tmp2, tmp4);
11343         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11344         __ add(tmp4, tmp4, 1);
11345         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11346         __ lsl(tmp2, tmp2, 1);
11347         __ mov(tmp4, wordSize/str2_chr_size);
11348       } else {
11349         __ mov(ch2, 0xE);
11350         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11351         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11352         __ lslv(tmp2, tmp2, tmp4);
11353         __ add(tmp4, tmp4, 1);
11354         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11355         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
11356         __ lsl(tmp2, tmp2, 1);
11357         __ mov(tmp4, wordSize/str2_chr_size);
11358         __ sub(str2, str2, str2_chr_size);
11359       }
11360       __ cmp(ch1, ch2);
11361       __ mov(tmp4, wordSize/str2_chr_size);
11362       __ br(__ NE, L_CMP_LOOP_NOMATCH);
11363     __ BIND(L_CMP_LOOP);
11364       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
11365                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
11366       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
11367                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
11368       __ add(tmp4, tmp4, 1);
11369       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
11370       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
11371       __ cmp(cnt1, ch2);
11372       __ br(__ EQ, L_CMP_LOOP);
11373     __ BIND(L_CMP_LOOP_NOMATCH);
11374       // here we're not matched
11375       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
11376       __ clz(tmp4, tmp2);
11377       __ add(str2, str2, str2_chr_size); // advance pointer
11378       __ b(L_HAS_ZERO_LOOP);
11379     __ align(OptoLoopAlignment);
11380     __ BIND(L_CMP_LOOP_LAST_CMP);
11381       __ cmp(cnt1, ch2);
11382       __ br(__ NE, L_CMP_LOOP_NOMATCH);
11383       __ b(DONE);
11384     __ align(OptoLoopAlignment);
11385     __ BIND(L_CMP_LOOP_LAST_CMP2);
11386       if (str2_isL) {
11387         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
11388         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11389         __ lslv(tmp2, tmp2, tmp4);
11390         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11391         __ add(tmp4, tmp4, 1);
11392         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11393         __ lsl(tmp2, tmp2, 1);
11394       } else {
11395         __ mov(ch2, 0xE);
11396         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11397         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11398         __ lslv(tmp2, tmp2, tmp4);
11399         __ add(tmp4, tmp4, 1);
11400         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11401         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
11402         __ lsl(tmp2, tmp2, 1);
11403         __ sub(str2, str2, str2_chr_size);
11404       }
11405       __ cmp(ch1, ch2);
11406       __ br(__ NE, L_CMP_LOOP_NOMATCH);
11407       __ b(DONE);
11408     __ align(OptoLoopAlignment);
11409     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
11410       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
11411       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
11412       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
11413       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
11414       // result by analyzed characters value, so, we can just reset lower bits
11415       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
11416       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
11417       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
11418       // index of last analyzed substring inside current octet. So, str2 in at
11419       // respective start address. We need to advance it to next octet
11420       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
11421       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
11422       __ bfm(result, zr, 0, 2 - str2_chr_shift);
11423       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
11424       __ movw(cnt2, cnt2);
11425       __ b(L_LOOP_PROCEED);
11426     __ align(OptoLoopAlignment);
11427     __ BIND(NOMATCH);
11428       __ mov(result, -1);
11429     __ BIND(DONE);
11430       __ pop(spilled_regs, sp);
11431       __ ret(lr);
11432 
11433     // record the stub entry and end
11434     store_archive_data(stub_id, entry, __ pc());
11435 
11436     return entry;
11437   }
11438 
11439   void generate_string_indexof_stubs() {
11440     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
11441     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
11442     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
11443   }
11444 
  // Emits code that interleaves the 16 bytes held in each of src1 and src2
  // with the bytes of v0 and stores the resulting 64 bytes through the
  // destination pointer in r1, post-incrementing r1 by 64.
  //
  //   generatePrfm - when true, additionally emit a PSTL1STRM prefetch
  //                  SoftwarePrefetchHintDistance bytes past the destination
  //   src1, src2   - vector registers holding the bytes to inflate
  //
  // Writes v1-v4. v0 is expected to be zero (the calling stub documents
  // "V0 = 0"), which makes the interleave widen every byte to 16 bits.
  void inflate_and_store_2_fp_registers(bool generatePrfm,
      FloatRegister src1, FloatRegister src2) {
    Register dst = r1;
    // Low and high halves of src1, each interleaved with v0.
    __ zip1(v1, __ T16B, src1, v0);
    __ zip2(v2, __ T16B, src1, v0);
    if (generatePrfm) {
      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
    }
    // v3/v4 are overwritten only after src1 has been fully consumed. The
    // visible callers pass (v3, v4) and (v5, v6) as sources, so this
    // instruction order must be preserved.
    __ zip1(v3, __ T16B, src2, v0);
    __ zip2(v4, __ T16B, src2, v0);
    // Single 64-byte store of the four result registers; dst += 64.
    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
  }
11457 
  // Generates the stub used to inflate a large byte array: every source byte
  // is interleaved with a byte of v0 (expected to be zero) and written out,
  // 64 source bytes per loop iteration. Returns the stub entry address, or the
  // archived entry when load_archive_data() already provides one.
  //
  // Register state expected on entry:
  // R0 = src
  // R1 = dst
  // R2 = len
  // R3 = len >> 3
  // V0 = 0
  // v1 = loaded 8 bytes
  // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
  address generate_large_byte_array_inflate() {
    StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // Reuse a previously archived copy of the stub when one is available.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
    // len (r2) is named for documentation only; the generated code does not
    // reference it.
    Register src = r0, dst = r1, len = r2, octetCounter = r3;
    // Threshold, in 8-byte groups, above which the prefetching loop is used.
    const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

    // do one more 8-byte read to have address 16-byte aligned in most cases
    // also use single store instruction
    __ ldrd(v2, __ post(src, 8));
    // Two octets are consumed here: the 8 bytes already in v1 plus the 8 bytes
    // just loaded into v2.
    __ sub(octetCounter, octetCounter, 2);
    __ zip1(v1, __ T16B, v1, v0);
    __ zip1(v2, __ T16B, v2, v0);
    __ st1(v1, v2, __ T16B, __ post(dst, 32));
    // Preload the first 64-byte batch consumed by either loop below.
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    __ br(__ LE, LOOP_START);
    __ b(LOOP_PRFM_START);
    // Prefetching loop: 64 source bytes (8 octets) per iteration.
    __ bind(LOOP_PRFM);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_PRFM_START);
      __ prfm(Address(src, SoftwarePrefetchHintDistance));
      __ sub(octetCounter, octetCounter, 8);
      // The flags set here are consumed by the br(GT) below, after the
      // inflate/store sequences have been emitted in between.
      __ subs(rscratch1, octetCounter, large_loop_threshold);
      inflate_and_store_2_fp_registers(true, v3, v4);
      inflate_and_store_2_fp_registers(true, v5, v6);
      __ br(__ GT, LOOP_PRFM);
      // Fewer than 8 octets left: nothing more for this stub to do.
      __ cmp(octetCounter, (u1)8);
      __ br(__ LT, DONE);
    // Non-prefetching loop for the remaining full 64-byte batches.
    __ bind(LOOP);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
      __ bind(LOOP_START);
      __ sub(octetCounter, octetCounter, 8);
      // As above, the flags from this cmp feed the br(GE) emitted after the
      // inflate/store sequences.
      __ cmp(octetCounter, (u1)8);
      inflate_and_store_2_fp_registers(false, v3, v4);
      inflate_and_store_2_fp_registers(false, v5, v6);
      __ br(__ GE, LOOP);
    __ bind(DONE);
      // NOTE(review): any tail shorter than 8 octets is left unprocessed here,
      // presumably for the caller to finish — confirm at the call site.
      __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, entry, __ pc());

    return entry;
  }
11518 
  /**
   *  Generates the one-block-per-iteration GHASH stub. Each iteration loads
   *  16 bytes of data, xors it into the running state, multiplies by the
   *  subkey and reduces by the field polynomial.
   *
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - current state address
   *  c_rarg1   - H key address
   *  c_rarg2   - data address
   *  c_rarg3   - number of blocks
   *
   *  Output:
   *  Updated state at c_rarg0
   *
   *  Returns the stub entry address (the archived one when available).
   *
   *  NOTE(review): the loop decrements the block count after processing a
   *  block, so a count of zero is presumably never passed in — confirm
   *  against the callers.
   */
  address generate_ghash_processBlocks_small() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order through the
    // calculation, bit-reversing the inputs and outputs.

    StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // Reuse a previously archived copy of the stub when one is available.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    Label polynomial; // local data generated at end of stub
    start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ adr(rscratch1, polynomial);
    __ ldrq(v24, rscratch1);    // The field polynomial

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH (v1) into v4
    __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                        /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
                        /*temps*/v6, v3, /*reuse/clobber b*/v2);
      // Reduce v7:v5 by the field polynomial
      __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);

      // One block done; loop until the block count reaches zero.
      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v0, __ T16B, v0);
    __ rbit(v0, __ T16B, v0);

    // Write the updated state back over the input state.
    __ st1(v0, __ T16B, state);
    __ ret(lr);

    // bind label and generate local polynomial data
    __ align(wordSize * 2);
    __ bind(polynomial);
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
11618 
11619   address generate_ghash_processBlocks(address small) {
11620     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
11621     int entry_count = StubInfo::entry_count(stub_id);
11622     assert(entry_count == 1, "sanity check");
11623     address start = load_archive_data(stub_id);
11624     if (start != nullptr) {
11625       return start;
11626     }
11627     Label polynomial;           // local data generated after stub
11628     __ align(CodeEntryAlignment);
11629     StubCodeMark mark(this, stub_id);
11630     start = __ pc();
11631 
11632     Register state   = c_rarg0;
11633     Register subkeyH = c_rarg1;
11634     Register data    = c_rarg2;
11635     Register blocks  = c_rarg3;
11636 
11637     const int unroll = 4;
11638 
11639     __ cmp(blocks, (unsigned char)(unroll * 2));
11640     __ br(__ LT, small);
11641 
11642     if (unroll > 1) {
11643     // Save state before entering routine
11644       __ sub(sp, sp, 4 * 16);
11645       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
11646       __ sub(sp, sp, 4 * 16);
11647       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
11648     }
11649 
11650     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
11651 
11652     if (unroll > 1) {
11653       // And restore state
11654       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
11655       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
11656     }
11657 
11658     __ cmp(blocks, (unsigned char)0);
11659     __ br(__ GT, small);
11660 
11661     __ ret(lr);
11662 
11663     // bind label and generate polynomial data
11664     __ align(wordSize * 2);
11665     __ bind(polynomial);
11666     __ emit_int64(0x87);  // The low-order bits of the field
11667                           // polynomial (i.e. p = z^7+z^2+z+1)
11668                           // repeated in the low and high parts of a
11669                           // 128-bit vector
11670     __ emit_int64(0x87);
11671 
11672     // record the stub entry and end
11673     store_archive_data(stub_id, start, __ pc());
11674 
11675     return start;
11676   }
11677 
  // Emits one SIMD round of Base64 encoding: loads 3 * size source bytes,
  // splits every 3-byte group into four 6-bit indices, translates the indices
  // through the codec table and stores 4 * size output bytes. Both src and
  // dst are post-incremented.
  //
  //   src, dst - source / destination pointers
  //   codec    - first of the four vector registers used as the lookup table
  //              (the visible caller loads v0-v3 from the codec address)
  //   size     - 16 selects the T16B arrangement, any other value T8B
  //
  // Uses v4-v6 for input and v16-v23 for indices and output.
  void generate_base64_encode_simdround(Register src, Register dst,
        FloatRegister codec, u8 size) {

    FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
    FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
    FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    // De-interleave: in0/in1/in2 receive the 1st/2nd/3rd byte of each group.
    __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

    // ind0 = top 6 bits of byte 0.
    __ ushr(ind0, arrangement, in0,  2);

    // ind1 = low 2 bits of byte 0 : top 4 bits of byte 1.
    __ ushr(ind1, arrangement, in1,  2);
    __ shl(in0,   arrangement, in0,  6);
    __ orr(ind1,  arrangement, ind1, in0);
    __ ushr(ind1, arrangement, ind1, 2);

    // ind2 = low 4 bits of byte 1 : top 2 bits of byte 2.
    __ ushr(ind2, arrangement, in2,  4);
    __ shl(in1,   arrangement, in1,  4);
    __ orr(ind2,  arrangement, in1,  ind2);
    __ ushr(ind2, arrangement, ind2, 2);

    // ind3 = low 6 bits of byte 2 (shift left then right clears the top 2).
    __ shl(ind3,  arrangement, in2,  2);
    __ ushr(ind3, arrangement, ind3, 2);

    // Translate each 6-bit index through the 4-register codec table.
    __ tbl(out0,  arrangement, codec,  4, ind0);
    __ tbl(out1,  arrangement, codec,  4, ind1);
    __ tbl(out2,  arrangement, codec,  4, ind2);
    __ tbl(out3,  arrangement, codec,  4, ind3);

    // Re-interleave the four output streams into memory.
    __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
  }
11711 
   /**
   *  Generates the Base64 encodeBlock stub. Inputs of at least 24 bytes are
   *  encoded with SIMD rounds (48 source bytes at a time, then at most one
   *  24-byte round); whatever remains is encoded 3 bytes at a time with
   *  scalar table lookups.
   *
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_length
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *
   *  Returns the stub entry address (the archived one when available).
   */
  address generate_base64_encodeBlock() {

    StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // Reuse a previously archived copy of the stub when one is available.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register src   = c_rarg0;  // source array
    Register soff  = c_rarg1;  // source start offset
    Register send  = c_rarg2;  // source end offset
    Register dst   = c_rarg3;  // dest array
    Register doff  = c_rarg4;  // position for writing to dest array
    Register isURL = c_rarg5;  // Base64 or URL character set

    // c_rarg6 and c_rarg7 are free to use as temps
    Register codec  = c_rarg6;
    Register length = c_rarg7;

    Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;

    // Turn (array, offset) pairs into raw pointers and compute the byte count.
    __ add(src, src, soff);
    __ add(dst, dst, doff);
    __ sub(length, send, soff);

    // load the codec base address
    __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
    __ cbz(isURL, ProcessData);
    __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));

    __ BIND(ProcessData);

    // too short to formup a SIMD loop, roll back
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, Process3B);

    // Load the 64-byte codec table into v0-v3 for the SIMD rounds.
    __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));

    __ BIND(Process48B);
    __ cmp(length, (u1)48);
    __ br(Assembler::LT, Process24B);
    generate_base64_encode_simdround(src, dst, v0, 16);
    __ sub(length, length, 48);
    __ b(Process48B);

    // At most one 24-byte round can follow the 48-byte loop, so there is no
    // branch back to Process24B.
    __ BIND(Process24B);
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_encode_simdround(src, dst, v0, 8);
    __ sub(length, length, 24);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);

    // Scalar tail.
    // NOTE(review): this loop only terminates when length is a non-zero
    // multiple of 3 on entry — presumably guaranteed by the caller; confirm.
    __ BIND(Process3B);
    //  3 src bytes, 24 bits
    __ ldrb(r10, __ post(src, 1));
    __ ldrb(r11, __ post(src, 1));
    __ ldrb(r12, __ post(src, 1));
    // Pack the three bytes into the low 24 bits of r12, first byte highest.
    __ orrw(r11, r11, r10, Assembler::LSL, 8);
    __ orrw(r12, r12, r11, Assembler::LSL, 8);
    // codec index
    __ ubfmw(r15, r12, 18, 23);
    __ ubfmw(r14, r12, 12, 17);
    __ ubfmw(r13, r12, 6,  11);
    __ andw(r12,  r12, 63);
    // get the code based on the codec
    __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
    __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
    __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
    __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
    // Emit the four encoded characters, most significant index first.
    __ strb(r15, __ post(dst, 1));
    __ strb(r14, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ sub(length, length, 3);
    __ cbnz(length, Process3B);

    __ BIND(Exit);
    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
11815 
  // Emits one SIMD round of Base64 decoding: loads 4 * size encoded bytes,
  // maps them to 6-bit values through two 4-register lookup tables, and packs
  // every four values into three output bytes.
  //
  //   src, dst - source / destination pointers (post-incremented)
  //   codecL   - first register of the table used for the lower-half lookup
  //   codecH   - first register of the table used for the higher-half lookup
  //   size     - 16 selects the T16B arrangement, any other value T8B
  //   Exit     - branch target taken once an illegal input byte is reached
  //
  // Expects v27 to hold 63 in every byte lane (set up by the calling stub).
  // Uses v16-v26 and v28-v31, rscratch2 and r10-r13.
  //
  // When illegal input is found, the bytes decoded from the legal groups that
  // precede it are still stored before branching to Exit.
  void generate_base64_decode_simdround(Register src, Register dst,
        FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {

    FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
    FloatRegister out0 = v20, out1 = v21, out2 = v22;

    FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
    FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;

    Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    // De-interleave: in0..in3 receive the 1st..4th character of each group.
    __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));

    // we need unsigned saturating subtract, to make sure all input values
    // in range [0, 63] will have 0U value in the higher half lookup
    __ uqsubv(decH0, __ T16B, in0, v27);
    __ uqsubv(decH1, __ T16B, in1, v27);
    __ uqsubv(decH2, __ T16B, in2, v27);
    __ uqsubv(decH3, __ T16B, in3, v27);

    // lower half lookup
    __ tbl(decL0, arrangement, codecL, 4, in0);
    __ tbl(decL1, arrangement, codecL, 4, in1);
    __ tbl(decL2, arrangement, codecL, 4, in2);
    __ tbl(decL3, arrangement, codecL, 4, in3);

    // higher half lookup
    __ tbx(decH0, arrangement, codecH, 4, decH0);
    __ tbx(decH1, arrangement, codecH, 4, decH1);
    __ tbx(decH2, arrangement, codecH, 4, decH2);
    __ tbx(decH3, arrangement, codecH, 4, decH3);

    // combine lower and higher
    __ orr(decL0, arrangement, decL0, decH0);
    __ orr(decL1, arrangement, decL1, decH1);
    __ orr(decL2, arrangement, decL2, decH2);
    __ orr(decL3, arrangement, decL3, decH3);

    // check illegal inputs, value larger than 63 (maximum of 6 bits)
    __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
    __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
    __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
    __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
    // in2 becomes the per-group error mask (OR of the four compare results);
    // rscratch2 is non-zero when any group in this round is illegal.
    __ orr(in0, arrangement, decH0, decH1);
    __ orr(in1, arrangement, decH2, decH3);
    __ orr(in2, arrangement, in0,   in1);
    __ umaxv(in3, arrangement, in2);
    __ umov(rscratch2, in3, __ B, 0);

    // get the data to output
    __ shl(out0,  arrangement, decL0, 2);
    __ ushr(out1, arrangement, decL1, 4);
    __ orr(out0,  arrangement, out0,  out1);
    __ shl(out1,  arrangement, decL1, 4);
    __ ushr(out2, arrangement, decL2, 2);
    __ orr(out1,  arrangement, out1,  out2);
    __ shl(out2,  arrangement, decL2, 6);
    __ orr(out2,  arrangement, out2,  decL3);

    __ cbz(rscratch2, NoIllegalData);

    // handle illegal input
    // r10 = error mask for the lower 8 groups.
    __ umov(r10, in2, __ D, 0);
    if (size == 16) {
      __ cbnz(r10, ErrorInLowerHalf);

      // illegal input is in higher half, store the lower half now.
      __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

      // Continue with the error mask and output bytes of the upper 8 groups.
      __ umov(r10, in2,  __ D, 1);
      __ umov(r11, out0, __ D, 1);
      __ umov(r12, out1, __ D, 1);
      __ umov(r13, out2, __ D, 1);
      __ b(StoreLegalData);

      __ BIND(ErrorInLowerHalf);
    }
    __ umov(r11, out0, __ D, 0);
    __ umov(r12, out1, __ D, 0);
    __ umov(r13, out2, __ D, 0);

    // Walk the 8 groups in r10-r13 one byte lane at a time: store the three
    // output bytes of each legal group and leave at the first illegal one.
    __ BIND(StoreLegalData);
    __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
    __ strb(r11, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    // Shift the next group's mask and data into the low byte.
    __ lsr(r10, r10, 8);
    __ lsr(r11, r11, 8);
    __ lsr(r12, r12, 8);
    __ lsr(r13, r13, 8);
    __ b(StoreLegalData);

    // All groups legal: store the whole round at once.
    __ BIND(NoIllegalData);
    __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
  }
11913 
11914 
   /**
   *  Generates the Base64 decodeBlock stub. The source length is rounded
   *  down to a multiple of 4. Inputs shorter than 144 bytes are decoded
   *  entirely by the scalar 4-byte loop; longer inputs first decode 80 bytes
   *  with the scalar loop, then use SIMD rounds of 64 and 32 bytes, and
   *  finish any remainder with the scalar loop again. Decoding stops at the
   *  first illegal input byte.
   *
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_length
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *  c_rarg6   - isMIME
   *
   *  Output:
   *  c_rarg0   - number of bytes written to the destination
   *
   *  Returns the stub entry address (the archived one when available).
   */
  address generate_base64_decodeBlock() {

    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
    // titled "Base64 decoding".

    StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // Reuse a previously archived copy of the stub when one is available.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register src    = c_rarg0;  // source array
    Register soff   = c_rarg1;  // source start offset
    Register send   = c_rarg2;  // source end offset
    Register dst    = c_rarg3;  // dest array
    Register doff   = c_rarg4;  // position for writing to dest array
    Register isURL  = c_rarg5;  // Base64 or URL character set
    Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation

    Register length = send;    // reuse send as length of source data to process

    // simd_codec shares c_rarg6 with the unused isMIME argument.
    Register simd_codec   = c_rarg6;
    Register nosimd_codec = c_rarg7;

    Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;

    __ enter();

    __ add(src, src, soff);
    __ add(dst, dst, doff);

    // Remember the initial destination pointer; the byte count returned at
    // Exit is computed relative to it.
    __ mov(doff, dst);

    __ sub(length, send, soff);
    // Clear the two low bits: round length down to a multiple of 4.
    __ bfm(length, zr, 0, 1);

    __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
    __ cbz(isURL, ProcessData);
    __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));

    __ BIND(ProcessData);
    // rscratch1 is the scalar loop counter: the whole length for short
    // inputs, or 79 (20 iterations = 80 bytes) ahead of the SIMD rounds.
    __ mov(rscratch1, length);
    __ cmp(length, (u1)144); // 144 = 80 + 64
    __ br(Assembler::LT, Process4B);

    // In the MIME case, the line length cannot be more than 76
    // bytes (see RFC 2045). This is too short a block for SIMD
    // to be worthwhile, so we use non-SIMD here.
    __ movw(rscratch1, 79);

    // NOTE(review): the loop body runs before the counter is tested, so at
    // least 4 source bytes are presumably guaranteed by the caller — confirm.
    __ BIND(Process4B);
    __ ldrw(r14, __ post(src, 4));
    // Split the loaded word into its four input characters.
    __ ubfxw(r10, r14, 0,  8);
    __ ubfxw(r11, r14, 8,  8);
    __ ubfxw(r12, r14, 16, 8);
    __ ubfxw(r13, r14, 24, 8);
    // get the de-code
    __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
    __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
    __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
    __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
    // error detection, 255u indicates an illegal input
    __ orrw(r14, r10, r11);
    __ orrw(r15, r12, r13);
    __ orrw(r14, r14, r15);
    __ tbnz(r14, 7, Exit);
    // recover the data
    __ lslw(r14, r10, 10);
    __ bfiw(r14, r11, 4, 6);
    __ bfmw(r14, r12, 2, 5);
    __ rev16w(r14, r14);
    __ bfiw(r13, r12, 6, 2);
    // Three output bytes per group: a halfword followed by a byte.
    __ strh(r14, __ post(dst, 2));
    __ strb(r13, __ post(dst, 1));
    // non-simd loop
    __ subsw(rscratch1, rscratch1, 4);
    __ br(Assembler::GT, Process4B);

    // if exiting from the 80-byte scalar pre-processing pass (counter started
    // at 79), rscratch1 == -1;
    // otherwise, rscratch1 == 0.
    __ cbzw(rscratch1, Exit);
    __ sub(length, length, 80);

    __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
    __ cbz(isURL, SIMDEnter);
    __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));

    __ BIND(SIMDEnter);
    // v0-v3 and v4-v7 hold the two 64-byte lookup tables; v27 holds 63 in
    // every byte lane for the SIMD rounds.
    __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
    __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
    __ mov(rscratch1, 63);
    __ dup(v27, __ T16B, rscratch1);

    __ BIND(Process64B);
    __ cmp(length, (u1)64);
    __ br(Assembler::LT, Process32B);
    generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
    __ sub(length, length, 64);
    __ b(Process64B);

    __ BIND(Process32B);
    __ cmp(length, (u1)32);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
    __ sub(length, length, 32);
    __ b(Process32B);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);
    // Finish the remaining bytes with the scalar loop; the counter then ends
    // at 0, which takes the Exit branch after the loop.
    __ movw(rscratch1, length);
    __ b(Process4B);

    __ BIND(Exit);
    // Return value: bytes written = current dst - initial dst.
    __ sub(c_rarg0, dst, doff);

    __ leave();
    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
12057 
12058   // Support for spin waits.
12059   address generate_spin_wait() {
12060     StubId stub_id = StubId::stubgen_spin_wait_id;
12061     int entry_count = StubInfo::entry_count(stub_id);
12062     assert(entry_count == 1, "sanity check");
12063     address start = load_archive_data(stub_id);
12064     if (start != nullptr) {
12065       return start;
12066     }
12067     __ align(CodeEntryAlignment);
12068     StubCodeMark mark(this, stub_id);
12069     start = __ pc();
12070 
12071     __ spin_wait();
12072     __ ret(lr);
12073 
12074     // record the stub entry and end
12075     store_archive_data(stub_id, start, __ pc());
12076 
12077     return start;
12078   }
12079 
  // Generates one secondary-supers lookup stub per table slot and records
  // every entry point in StubRoutines::_lookup_secondary_supers_table_stubs.
  // Slot 0 is treated as the stub's primary entry; the entries for slots
  // 1..SECONDARY_SUPERS_TABLE_SIZE-1 are kept in 'entries' so they can be
  // archived and restored together with it.
  void generate_lookup_secondary_supers_table_stub() {
    StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
    GrowableArray<address> entries;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
    // When an archived copy exists, only the entry table needs populating.
    address start = load_archive_data(stub_id, &entries);
    if (start != nullptr) {
      assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
             "unexpected extra entry count %d", entries.length());
      StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
      for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
        // entries holds slots 1..N-1, hence the index shift by one.
        StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
      }
      return;
    }

    StubCodeMark mark(this, stub_id);

    // Registers used by the generated code. r_bitmap is declared here but not
    // passed to the lookup call below.
    // NOTE(review): presumably lookup_secondary_supers_table_const uses
    // rscratch2 implicitly for the bitmap — confirm in the macro assembler.
    const Register
      r_super_klass  = r0,
      r_array_base   = r1,
      r_array_length = r2,
      r_array_index  = r3,
      r_sub_klass    = r4,
      r_bitmap       = rscratch2,
      result         = r5;
    const FloatRegister
      vtemp          = v0;

    for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
      address next_entry = __ pc();
      StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
      if (slot == 0) {
        start = next_entry;
      } else {
        entries.append(next_entry);
      }
      // L_success is declared but not referenced by the code emitted here.
      Label L_success;
      __ enter();
      __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
                                             r_array_base, r_array_length, r_array_index,
                                             vtemp, result, slot,
                                             /*stub_is_near*/true);
      __ leave();
      __ ret(lr);
    }
    // record the stub entry and end plus all the auxiliary entries
    store_archive_data(stub_id, start, __ pc(), &entries);
  }
12129 
12130   // Slow path implementation for UseSecondarySupersTable.
12131   address generate_lookup_secondary_supers_table_slow_path_stub() {
12132     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
12133     int entry_count = StubInfo::entry_count(stub_id);
12134     assert(entry_count == 1, "sanity check");
12135     address start = load_archive_data(stub_id);
12136     if (start != nullptr) {
12137       return start;
12138     }
12139     StubCodeMark mark(this, stub_id);
12140     start = __ pc();
12141     const Register
12142       r_super_klass  = r0,        // argument
12143       r_array_base   = r1,        // argument
12144       temp1          = r2,        // temp
12145       r_array_index  = r3,        // argument
12146       r_bitmap       = rscratch2, // argument
12147       result         = r5;        // argument
12148 
12149     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
12150     __ ret(lr);
12151 
12152     // record the stub entry and end
12153     store_archive_data(stub_id, start, __ pc());
12154 
12155     return start;
12156   }
12157 
12158 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12159 
12160   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
12161   //
12162   // If LSE is in use, generate LSE versions of all the stubs. The
12163   // non-LSE versions are in atomic_aarch64.S.
12164 
12165   // class AtomicStubMark records the entry point of a stub and the
12166   // stub pointer which will point to it. The stub pointer is set to
12167   // the entry point when ~AtomicStubMark() is called, which must be
12168   // after ICache::invalidate_range. This ensures safe publication of
12169   // the generated code.
12170   class AtomicStubMark {
12171     address _entry_point;
12172     aarch64_atomic_stub_t *_stub;
12173     MacroAssembler *_masm;
12174   public:
12175     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
12176       _masm = masm;
12177       __ align(32);
12178       _entry_point = __ pc();
12179       _stub = stub;
12180     }
12181     ~AtomicStubMark() {
12182       *_stub = (aarch64_atomic_stub_t)_entry_point;
12183     }
12184   };
12185 
12186   // NB: For memory_order_conservative we need a trailing membar after
12187   // LSE atomic operations but not a leading membar.
12188   //
12189   // We don't need a leading membar because a clause in the Arm ARM
12190   // says:
12191   //
12192   //   Barrier-ordered-before
12193   //
  //   Barrier instructions order prior Memory effects before subsequent
  //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
  //   instruction with both Acquire and Release semantics.
12200   //
12201   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
12202   // and Release semantics, therefore we don't need a leading
12203   // barrier. However, there is no corresponding Barrier-ordered-after
12204   // relationship, therefore we need a trailing membar to prevent a
12205   // later store or load from being reordered with the store in an
12206   // atomic instruction.
12207   //
12208   // This was checked by using the herd7 consistency model simulator
12209   // (http://diy.inria.fr/) with this test case:
12210   //
12211   // AArch64 LseCas
12212   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
12213   // P0 | P1;
12214   // LDR W4, [X2] | MOV W3, #0;
12215   // DMB LD       | MOV W4, #1;
12216   // LDR W3, [X1] | CASAL W3, W4, [X1];
12217   //              | DMB ISH;
12218   //              | STR W4, [X2];
12219   // exists
12220   // (0:X3=0 /\ 0:X4=1)
12221   //
12222   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
12223   // with the store to x in P1. Without the DMB in P1 this may happen.
12224   //
12225   // At the time of writing we don't know of any AArch64 hardware that
12226   // reorders stores in this way, but the Reference Manual permits it.
12227 
12228   void gen_cas_entry(Assembler::operand_size size,
12229                      atomic_memory_order order) {
12230     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
12231       exchange_val = c_rarg2;
12232     bool acquire, release;
12233     switch (order) {
12234       case memory_order_relaxed:
12235         acquire = false;
12236         release = false;
12237         break;
12238       case memory_order_release:
12239         acquire = false;
12240         release = true;
12241         break;
12242       default:
12243         acquire = true;
12244         release = true;
12245         break;
12246     }
12247     __ mov(prev, compare_val);
12248     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
12249     if (order == memory_order_conservative) {
12250       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12251     }
12252     if (size == Assembler::xword) {
12253       __ mov(r0, prev);
12254     } else {
12255       __ movw(r0, prev);
12256     }
12257     __ ret(lr);
12258   }
12259 
12260   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
12261     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
12262     // If not relaxed, then default to conservative.  Relaxed is the only
12263     // case we use enough to be worth specializing.
12264     if (order == memory_order_relaxed) {
12265       __ ldadd(size, incr, prev, addr);
12266     } else {
12267       __ ldaddal(size, incr, prev, addr);
12268       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12269     }
12270     if (size == Assembler::xword) {
12271       __ mov(r0, prev);
12272     } else {
12273       __ movw(r0, prev);
12274     }
12275     __ ret(lr);
12276   }
12277 
12278   void gen_swpal_entry(Assembler::operand_size size) {
12279     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
12280     __ swpal(size, incr, prev, addr);
12281     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12282     if (size == Assembler::xword) {
12283       __ mov(r0, prev);
12284     } else {
12285       __ movw(r0, prev);
12286     }
12287     __ ret(lr);
12288   }
12289 
12290   void generate_atomic_entry_points() {
12291     if (! UseLSE) {
12292       return;
12293     }
12294     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
12295     GrowableArray<address> entries;
12296     int entry_count = StubInfo::entry_count(stub_id);
12297     address start = load_archive_data(stub_id, &entries);
12298     if (start != nullptr) {
12299       assert(entries.length() == entry_count - 1,
12300              "unexpected extra entry count %d", entries.length());
12301       aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
12302       int idx = 0;
12303       aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12304       aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12305       aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12306       aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12307       aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12308       aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12309       aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12310       aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12311       aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12312       aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12313       aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12314       aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12315       aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12316       aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12317       aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12318       assert(idx == entries.length(), "sanity!");
12319       return;
12320     }
12321 
12322     __ align(CodeEntryAlignment);
12323     StubCodeMark mark(this, stub_id);
12324     start = __ pc();
12325     address end;
12326     {
12327     // ADD, memory_order_conservative
12328     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
12329     gen_ldadd_entry(Assembler::word, memory_order_conservative);
12330 
12331     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
12332     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
12333 
12334     // ADD, memory_order_relaxed
12335     AtomicStubMark mark_fetch_add_4_relaxed
12336       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
12337     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
12338 
12339     AtomicStubMark mark_fetch_add_8_relaxed
12340       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
12341     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
12342 
12343     // XCHG, memory_order_conservative
12344     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
12345     gen_swpal_entry(Assembler::word);
12346 
12347     AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
12348     gen_swpal_entry(Assembler::xword);
12349 
12350     // CAS, memory_order_conservative
12351     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
12352     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
12353 
12354     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
12355     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
12356 
12357     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
12358     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
12359 
12360     // CAS, memory_order_relaxed
12361     AtomicStubMark mark_cmpxchg_1_relaxed
12362       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
12363     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
12364 
12365     AtomicStubMark mark_cmpxchg_4_relaxed
12366       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
12367     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
12368 
12369     AtomicStubMark mark_cmpxchg_8_relaxed
12370       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
12371     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
12372 
12373     AtomicStubMark mark_cmpxchg_4_release
12374       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
12375     gen_cas_entry(MacroAssembler::word, memory_order_release);
12376 
12377     AtomicStubMark mark_cmpxchg_8_release
12378       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
12379     gen_cas_entry(MacroAssembler::xword, memory_order_release);
12380 
12381     AtomicStubMark mark_cmpxchg_4_seq_cst
12382       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
12383     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
12384 
12385     AtomicStubMark mark_cmpxchg_8_seq_cst
12386       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
12387     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
12388 
12389     end = __ pc();
12390 
12391     ICache::invalidate_range(start, end - start);
12392     // exit block to force update of AtomicStubMark targets
12393     }
12394 
12395     assert(start == (address)aarch64_atomic_fetch_add_4_impl,
12396            "atomic stub should be at start of buffer");
12397     // record the stub start and end plus all the entries saved by the
12398     // AtomicStubMark destructor
12399     entries.append((address)aarch64_atomic_fetch_add_8_impl);
12400     entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
12401     entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
12402     entries.append((address)aarch64_atomic_xchg_4_impl);
12403     entries.append((address)aarch64_atomic_xchg_8_impl);
12404     entries.append((address)aarch64_atomic_cmpxchg_1_impl);
12405     entries.append((address)aarch64_atomic_cmpxchg_4_impl);
12406     entries.append((address)aarch64_atomic_cmpxchg_8_impl);
12407     entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
12408     entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
12409     entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
12410     entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
12411     entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
12412     entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
12413     entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);
12414 
12415     assert(entries.length() == entry_count - 1,
12416            "unexpected extra entry count %d", entries.length());
12417 
12418     store_archive_data(stub_id, start, end, &entries);
12419   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
12421 
12422   static void save_return_registers(MacroAssembler* masm) {
12423     if (InlineTypeReturnedAsFields) {
12424       masm->push(RegSet::range(r0, r7), sp);
12425       masm->sub(sp, sp, 4 * wordSize);
12426       masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
12427       masm->sub(sp, sp, 4 * wordSize);
12428       masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
12429     } else {
12430       masm->fmovd(rscratch1, v0);
12431       masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
12432     }
12433   }
12434 
12435   static void restore_return_registers(MacroAssembler* masm) {
12436     if (InlineTypeReturnedAsFields) {
12437       masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
12438       masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
12439       masm->pop(RegSet::range(r0, r7), sp);
12440     } else {
12441       masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
12442       masm->fmovd(v0, rscratch1);
12443     }
12444   }
12445 
12446   address generate_cont_thaw(Continuation::thaw_kind kind) {
12447     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
12448     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
12449 
12450     address start = __ pc();
12451 
12452     if (return_barrier) {
12453       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
12454       __ mov(sp, rscratch1);
12455     }
12456     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
12457 
12458     if (return_barrier) {
12459       // preserve possible return value from a method returning to the return barrier
12460       save_return_registers(_masm);
12461     }
12462 
12463     __ movw(c_rarg1, (return_barrier ? 1 : 0));
12464     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
12465     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
12466 
12467     if (return_barrier) {
12468       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
12469       restore_return_registers(_masm);
12470     }
12471     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
12472 
12473 
12474     Label thaw_success;
12475     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
12476     __ cbnz(rscratch2, thaw_success);
12477     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
12478     __ br(rscratch1);
12479     __ bind(thaw_success);
12480 
12481     // make room for the thawed frames
12482     __ sub(rscratch1, sp, rscratch2);
12483     __ andr(rscratch1, rscratch1, -16); // align
12484     __ mov(sp, rscratch1);
12485 
12486     if (return_barrier) {
12487       // save original return value -- again
12488       save_return_registers(_masm);
12489     }
12490 
12491     // If we want, we can templatize thaw by kind, and have three different entries
12492     __ movw(c_rarg1, (uint32_t)kind);
12493 
12494     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
12495     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
12496 
12497     if (return_barrier) {
12498       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
12499       restore_return_registers(_masm);
12500     } else {
12501       __ mov(r0, zr); // return 0 (success) from doYield
12502     }
12503 
12504     // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
12505     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
12506     __ mov(rfp, sp);
12507 
12508     if (return_barrier_exception) {
12509       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
12510       __ authenticate_return_address(c_rarg1);
12511       __ verify_oop(r0);
12512       // save return value containing the exception oop in callee-saved R19
12513       __ mov(r19, r0);
12514 
12515       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
12516 
12517       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
12518       // __ reinitialize_ptrue();
12519 
12520       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
12521 
12522       __ mov(r1, r0); // the exception handler
12523       __ mov(r0, r19); // restore return value containing the exception oop
12524       __ verify_oop(r0);
12525 
12526       __ leave();
12527       __ mov(r3, lr);
12528       __ br(r1); // the exception handler
12529     } else {
12530       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
12531       __ leave();
12532       __ ret(lr);
12533     }
12534 
12535     return start;
12536   }
12537 
12538   address generate_cont_thaw() {
12539     if (!Continuations::enabled()) return nullptr;
12540 
12541     StubId stub_id = StubId::stubgen_cont_thaw_id;
12542     int entry_count = StubInfo::entry_count(stub_id);
12543     assert(entry_count == 1, "sanity check");
12544     address start = load_archive_data(stub_id);
12545     if (start != nullptr) {
12546       return start;
12547     }
12548     StubCodeMark mark(this, stub_id);
12549     start = __ pc();
12550     generate_cont_thaw(Continuation::thaw_top);
12551 
12552     // record the stub start and end
12553     store_archive_data(stub_id, start, __ pc());
12554 
12555     return start;
12556   }
12557 
12558   address generate_cont_returnBarrier() {
12559     if (!Continuations::enabled()) return nullptr;
12560 
12561     // TODO: will probably need multiple return barriers depending on return type
12562     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
12563     int entry_count = StubInfo::entry_count(stub_id);
12564     assert(entry_count == 1, "sanity check");
12565     address start = load_archive_data(stub_id);
12566     if (start != nullptr) {
12567       return start;
12568     }
12569     StubCodeMark mark(this, stub_id);
12570     start = __ pc();
12571 
12572     generate_cont_thaw(Continuation::thaw_return_barrier);
12573 
12574     // record the stub start and end
12575     store_archive_data(stub_id, start, __ pc());
12576 
12577     return start;
12578   }
12579 
12580   address generate_cont_returnBarrier_exception() {
12581     if (!Continuations::enabled()) return nullptr;
12582 
12583     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
12584     int entry_count = StubInfo::entry_count(stub_id);
12585     assert(entry_count == 1, "sanity check");
12586     address start = load_archive_data(stub_id);
12587     if (start != nullptr) {
12588       return start;
12589     }
12590     StubCodeMark mark(this, stub_id);
12591     start = __ pc();
12592 
12593     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
12594 
12595     // record the stub start and end
12596     store_archive_data(stub_id, start, __ pc());
12597 
12598     return start;
12599   }
12600 
12601   address generate_cont_preempt_stub() {
12602     if (!Continuations::enabled()) return nullptr;
12603     StubId stub_id = StubId::stubgen_cont_preempt_id;
12604     int entry_count = StubInfo::entry_count(stub_id);
12605     assert(entry_count == 1, "sanity check");
12606     address start = load_archive_data(stub_id);
12607     if (start != nullptr) {
12608       return start;
12609     }
12610     StubCodeMark mark(this, stub_id);
12611     start = __ pc();
12612 
12613     __ reset_last_Java_frame(true);
12614 
12615     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
12616     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
12617     __ mov(sp, rscratch2);
12618 
12619     Label preemption_cancelled;
12620     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
12621     __ cbnz(rscratch1, preemption_cancelled);
12622 
12623     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
12624     SharedRuntime::continuation_enter_cleanup(_masm);
12625     __ leave();
12626     __ ret(lr);
12627 
12628     // We acquired the monitor after freezing the frames so call thaw to continue execution.
12629     __ bind(preemption_cancelled);
12630     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
12631     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
12632     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
12633     __ ldr(rscratch1, Address(rscratch1));
12634     __ br(rscratch1);
12635 
12636     // record the stub start and end
12637     store_archive_data(stub_id, start, __ pc());
12638 
12639     return start;
12640   }
12641 
12642   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
12643   // are represented as long[5], with BITS_PER_LIMB = 26.
12644   // Pack five 26-bit limbs into three 64-bit registers.
12645   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
12646     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
12647     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
12648     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
12649     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
12650 
12651     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
12652     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
12653     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
12654     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
12655 
12656     if (dest2->is_valid()) {
12657       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
12658     } else {
12659 #ifdef ASSERT
12660       Label OK;
12661       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
12662       __ br(__ EQ, OK);
12663       __ stop("high bits of Poly1305 integer should be zero");
12664       __ should_not_reach_here();
12665       __ bind(OK);
12666 #endif
12667     }
12668   }
12669 
12670   // As above, but return only a 128-bit integer, packed into two
12671   // 64-bit registers.
  void pack_26(Register dest0, Register dest1, Register src) {
    // Forward to the three-destination overload with noreg as dest2, which
    // selects its "no third register" path.
    pack_26(dest0, dest1, noreg, src);
  }
12675 
12676   // Multiply and multiply-accumulate unsigned 64-bit registers.
  // prod_hi:prod_lo = n * m, as a full-width unsigned product.
  void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
    // The low half is written before n and m are read again for the high
    // half, so prod_lo must not be the same register as n or m.
    __ mul(prod_lo, n, m);
    __ umulh(prod_hi, n, m);
  }
12681   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
12682     wide_mul(rscratch1, rscratch2, n, m);
12683     __ adds(sum_lo, sum_lo, rscratch1);
12684     __ adc(sum_hi, sum_hi, rscratch2);
12685   }
12686 
12687   // Poly1305, RFC 7539
12688 
12689   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
12690   // description of the tricks used to simplify and accelerate this
12691   // computation.
12692 
12693   address generate_poly1305_processBlocks() {
12694     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
12695     int entry_count = StubInfo::entry_count(stub_id);
12696     assert(entry_count == 1, "sanity check");
12697     address start = load_archive_data(stub_id);
12698     if (start != nullptr) {
12699       return start;
12700     }
12701     __ align(CodeEntryAlignment);
12702     StubCodeMark mark(this, stub_id);
12703     start = __ pc();
12704     Label here;
12705     __ enter();
12706     RegSet callee_saved = RegSet::range(r19, r28);
12707     __ push(callee_saved, sp);
12708 
12709     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
12710 
12711     // Arguments
12712     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
12713 
12714     // R_n is the 128-bit randomly-generated key, packed into two
12715     // registers.  The caller passes this key to us as long[5], with
12716     // BITS_PER_LIMB = 26.
12717     const Register R_0 = *++regs, R_1 = *++regs;
12718     pack_26(R_0, R_1, r_start);
12719 
12720     // RR_n is (R_n >> 2) * 5
12721     const Register RR_0 = *++regs, RR_1 = *++regs;
12722     __ lsr(RR_0, R_0, 2);
12723     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
12724     __ lsr(RR_1, R_1, 2);
12725     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
12726 
12727     // U_n is the current checksum
12728     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
12729     pack_26(U_0, U_1, U_2, acc_start);
12730 
12731     static constexpr int BLOCK_LENGTH = 16;
12732     Label DONE, LOOP;
12733 
12734     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
12735     __ br(Assembler::LT, DONE); {
12736       __ bind(LOOP);
12737 
12738       // S_n is to be the sum of U_n and the next block of data
12739       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
12740       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
12741       __ adds(S_0, U_0, S_0);
12742       __ adcs(S_1, U_1, S_1);
12743       __ adc(S_2, U_2, zr);
12744       __ add(S_2, S_2, 1);
12745 
12746       const Register U_0HI = *++regs, U_1HI = *++regs;
12747 
12748       // NB: this logic depends on some of the special properties of
12749       // Poly1305 keys. In particular, because we know that the top
12750       // four bits of R_0 and R_1 are zero, we can add together
12751       // partial products without any risk of needing to propagate a
12752       // carry out.
12753       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
12754       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
12755       __ andr(U_2, R_0, 3);
12756       __ mul(U_2, S_2, U_2);
12757 
12758       // Recycle registers S_0, S_1, S_2
12759       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
12760 
12761       // Partial reduction mod 2**130 - 5
12762       __ adds(U_1, U_0HI, U_1);
12763       __ adc(U_2, U_1HI, U_2);
12764       // Sum now in U_2:U_1:U_0.
12765       // Dead: U_0HI, U_1HI.
12766       regs = (regs.remaining() + U_0HI + U_1HI).begin();
12767 
12768       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
12769 
12770       // First, U_2:U_1:U_0 += (U_2 >> 2)
12771       __ lsr(rscratch1, U_2, 2);
12772       __ andr(U_2, U_2, (u8)3);
12773       __ adds(U_0, U_0, rscratch1);
12774       __ adcs(U_1, U_1, zr);
12775       __ adc(U_2, U_2, zr);
12776       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
12777       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
12778       __ adcs(U_1, U_1, zr);
12779       __ adc(U_2, U_2, zr);
12780 
12781       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
12782       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
12783       __ br(~ Assembler::LT, LOOP);
12784     }
12785 
12786     // Further reduce modulo 2^130 - 5
12787     __ lsr(rscratch1, U_2, 2);
12788     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
12789     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
12790     __ adcs(U_1, U_1, zr);
12791     __ andr(U_2, U_2, (u1)3);
12792     __ adc(U_2, U_2, zr);
12793 
12794     // Unpack the sum into five 26-bit limbs and write to memory.
12795     __ ubfiz(rscratch1, U_0, 0, 26);
12796     __ ubfx(rscratch2, U_0, 26, 26);
12797     __ stp(rscratch1, rscratch2, Address(acc_start));
12798     __ ubfx(rscratch1, U_0, 52, 12);
12799     __ bfi(rscratch1, U_1, 12, 14);
12800     __ ubfx(rscratch2, U_1, 14, 26);
12801     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
12802     __ ubfx(rscratch1, U_1, 40, 24);
12803     __ bfi(rscratch1, U_2, 24, 3);
12804     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
12805 
12806     __ bind(DONE);
12807     __ pop(callee_saved, sp);
12808     __ leave();
12809     __ ret(lr);
12810 
12811     // record the stub start and end
12812     store_archive_data(stub_id, start, __ pc());
12813 
12814     return start;
12815   }
12816 
12817   // exception handler for upcall stubs
12818   address generate_upcall_stub_exception_handler() {
12819     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
12820     int entry_count = StubInfo::entry_count(stub_id);
12821     assert(entry_count == 1, "sanity check");
12822     address start = load_archive_data(stub_id);
12823     if (start != nullptr) {
12824       return start;
12825     }
12826     StubCodeMark mark(this, stub_id);
12827     start = __ pc();
12828 
12829     // Native caller has no idea how to handle exceptions,
12830     // so we just crash here. Up to callee to catch exceptions.
12831     __ verify_oop(r0);
12832     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
12833     __ blr(rscratch1);
12834     __ should_not_reach_here();
12835 
12836     // record the stub start and end
12837     store_archive_data(stub_id, start, __ pc());
12838 
12839     return start;
12840   }
12841 
12842   // load Method* target of MethodHandle
12843   // j_rarg0 = jobject receiver
12844   // rmethod = result
12845   address generate_upcall_stub_load_target() {
12846     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
12847     int entry_count = StubInfo::entry_count(stub_id);
12848     assert(entry_count == 1, "sanity check");
12849     address start = load_archive_data(stub_id);
12850     if (start != nullptr) {
12851       return start;
12852     }
12853     StubCodeMark mark(this, stub_id);
12854     start = __ pc();
12855 
12856     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
12857       // Load target method from receiver
12858     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
12859     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
12860     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
12861     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
12862                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
12863                       noreg, noreg);
12864     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
12865 
12866     __ ret(lr);
12867 
12868     // record the stub start and end
12869     store_archive_data(stub_id, start, __ pc());
12870 
12871     return start;
12872   }
12873 
12874 #undef __
12875 #define __ masm->
12876 
12877   class MontgomeryMultiplyGenerator : public MacroAssembler {
          // Generator for the Montgomery multiply and Montgomery square stubs.
          // It derives from MacroAssembler and is constructed on the code
          // buffer of an existing Assembler, so every helper below emits
          // instructions directly into that buffer.  The private helpers
          // (pre1, step, post1, pre2, post2, normalize, ...) each emit one
          // fragment of the C sketches that follow generate_multiply() and
          // generate_square().
12878
          // Registers used by the generated code; all are assigned in the
          // constructor.  *_base are the array base pointers, P* the running
          // pointers, R* the current digits, t0..t2 the accumulator.
12879     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
12880       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
12881
12882     RegSet _toSave;   // pushed by save_regs(), popped by restore_regs()
12883     bool _squaring;   // true: squaring variant, Pb_base aliases Pa_base
12884
12885   public:
          // Binds this generator to the code buffer of `as` and hands out
          // registers in the iteration order of the set r0..r26 minus
          // r18_tls.  When `squaring` is true no register is consumed for
          // Pb_base, so every later assignment moves up by one register
          // (this matches the shorter squaring argument list documented on
          // generate_multiply()).
12886     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
12887       : MacroAssembler(as->code()), _squaring(squaring) {
12888
12889       // Register allocation
12890
12891       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
12892       Pa_base = *regs;       // Argument registers
12893       if (squaring)
12894         Pb_base = Pa_base;
12895       else
12896         Pb_base = *++regs;
12897       Pn_base = *++regs;
12898       Rlen= *++regs;
12899       inv = *++regs;
12900       Pm_base = *++regs;
12901
12902                           // Working registers:
12903       Ra =  *++regs;        // The current digit of a, b, n, and m.
12904       Rb =  *++regs;
12905       Rm =  *++regs;
12906       Rn =  *++regs;
12907
12908       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
12909       Pb =  *++regs;
12910       Pm =  *++regs;
12911       Pn =  *++regs;
12912
12913       t0 =  *++regs;        // Three registers which form a
12914       t1 =  *++regs;        // triple-precision accumulator.
12915       t2 =  *++regs;
12916
12917       Ri =  *++regs;        // Inner and outer loop indexes.
12918       Rj =  *++regs;
12919
12920       Rhi_ab = *++regs;     // Product registers: low and high parts
12921       Rlo_ab = *++regs;     // of a*b and m*n.
12922       Rhi_mn = *++regs;
12923       Rlo_mn = *++regs;
12924
12925       // r19 and up are callee-saved.
            // *regs is the last register handed out above (Rlo_mn).  Pm_base
            // is added because the generated code overwrites it with the
            // address of a stack scratch area and needs the caller's value
            // back at the end.
12926       _toSave = RegSet::range(r19, *regs) + Pm_base;
12927     }
12928
12929   private:
          // Emits a push of every register in _toSave onto the stack at sp.
12930     void save_regs() {
12931       push(_toSave, sp);
12932     }
12933
          // Emits the matching pop of every register in _toSave.
12934     void restore_regs() {
12935       pop(_toSave, sp);
12936     }
12937
          // Emits a run-time loop that executes the code produced by `block`
          // `count` times (count is expected to be non-negative).  The body is
          // emitted twice back to back: an odd count enters at the second
          // copy, a zero count branches straight to the end.  `block` is a
          // pointer to a no-argument member function of this class and is
          // invoked at generation time.  The generated code decrements count
          // by 2 per trip, so count and the condition flags are clobbered.
12938     template <typename T>
12939     void unroll_2(Register count, T block) {
12940       Label loop, end, odd;
12941       tbnz(count, 0, odd);   // bit 0 set: odd count, start at the second copy
12942       cbz(count, end);       // even and zero: nothing to do
12943       align(16);
12944       bind(loop);
12945       (this->*block)();
12946       bind(odd);
12947       (this->*block)();
12948       subs(count, count, 2);
12949       br(Assembler::GT, loop);
12950       bind(end);
12951     }
12952
          // Same loop shape as unroll_2(count, block) above, for a `block`
          // member function that takes the three registers (d, s, tmp); they
          // are passed through unchanged to both emitted copies.
12953     template <typename T>
12954     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
12955       Label loop, end, odd;
12956       tbnz(count, 0, odd);
12957       cbz(count, end);
12958       align(16);
12959       bind(loop);
12960       (this->*block)(d, s, tmp);
12961       bind(odd);
12962       (this->*block)(d, s, tmp);
12963       subs(count, count, 2);
12964       br(Assembler::GT, loop);
12965       bind(end);
12966     }
12967
          // Emits the set-up for outer iteration i of the first loop
          // ("for (i = 0; i < len; i++)"): Pa/Pm point at digit 0 of a/m,
          // Pb/Pn at digit i of b/n, the four current digits are preloaded
          // and the pending m*n product is cleared.  i is applied as a
          // word-scaled, uxtw-extended index.
12968     void pre1(RegisterOrConstant i) {
12969       block_comment("pre1");
12970       // Pa = Pa_base;
12971       // Pb = Pb_base + i;
12972       // Pm = Pm_base;
12973       // Pn = Pn_base + i;
12974       // Ra = *Pa;
12975       // Rb = *Pb;
12976       // Rm = *Pm;
12977       // Rn = *Pn;
12978       ldr(Ra, Address(Pa_base));
12979       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
12980       ldr(Rm, Address(Pm_base));
12981       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
12982       lea(Pa, Address(Pa_base));
12983       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
12984       lea(Pm, Address(Pm_base));
12985       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
12986
12987       // Zero the m*n result.
12988       mov(Rhi_mn, zr);
12989       mov(Rlo_mn, zr);
12990     }
12991
12992     // The core multiply-accumulate step of a Montgomery
12993     // multiplication.  The idea is to schedule operations as a
12994     // pipeline so that instructions with long latencies (loads and
12995     // multiplies) have time to complete before their results are
12996     // used.  This most benefits in-order implementations of the
12997     // architecture but out-of-order ones also benefit.
          //
          // On entry Rhi_mn:Rlo_mn holds an m*n product that has not been
          // accumulated yet; it is added first.  On exit the a*b product of
          // this step has been accumulated and a fresh m*n product is left
          // pending in Rhi_mn:Rlo_mn for whatever is emitted next.
12998     void step() {
12999       block_comment("step");
13000       // MACC(Ra, Rb, t0, t1, t2);
13001       // Ra = *++Pa;
13002       // Rb = *--Pb;
13003       umulh(Rhi_ab, Ra, Rb);
13004       mul(Rlo_ab, Ra, Rb);
13005       ldr(Ra, pre(Pa, wordSize));
13006       ldr(Rb, pre(Pb, -wordSize));
13007       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
13008                                        // previous iteration.
13009       // MACC(Rm, Rn, t0, t1, t2);
13010       // Rm = *++Pm;
13011       // Rn = *--Pn;
13012       umulh(Rhi_mn, Rm, Rn);
13013       mul(Rlo_mn, Rm, Rn);
13014       ldr(Rm, pre(Pm, wordSize));
13015       ldr(Rn, pre(Pn, -wordSize));
13016       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
13017     }
13018
          // Emits the tail of an outer iteration of the first loop in the
          // multiply stub: accumulates the last a*b product and the pending
          // m*n product, computes Rm = t0 * inv and stores it at *Pm,
          // accumulates the high half of Rm*Rn, then shifts the accumulator
          // down by one digit (t0 = t1, t1 = t2, t2 = 0).
13019     void post1() {
13020       block_comment("post1");
13021
13022       // MACC(Ra, Rb, t0, t1, t2);
13023       // Ra = *++Pa;
13024       // Rb = *--Pb;
            // (only the MACC is emitted here; there are no pointer-advancing
            // loads in this fragment)
13025       umulh(Rhi_ab, Ra, Rb);
13026       mul(Rlo_ab, Ra, Rb);
13027       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
13028       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
13029
13030       // *Pm = Rm = t0 * inv;
13031       mul(Rm, t0, inv);
13032       str(Rm, Address(Pm));
13033
13034       // MACC(Rm, Rn, t0, t1, t2);
13035       // t0 = t1; t1 = t2; t2 = 0;
13036       umulh(Rhi_mn, Rm, Rn);
13037
13038 #ifndef PRODUCT
13039       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
13040       {
13041         mul(Rlo_mn, Rm, Rn);
13042         add(Rlo_mn, t0, Rlo_mn);
13043         Label ok;
13044         cbz(Rlo_mn, ok); {
13045           stop("broken Montgomery multiply");
13046         } bind(ok);
13047       }
13048 #endif
13049       // We have very carefully set things up so that
13050       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
13051       // the lower half of Rm * Rn because we know the result already:
13052       // it must be -t0.  t0 + (-t0) must generate a carry iff
13053       // t0 != 0.  So, rather than do a mul and an adds we just set
13054       // the carry flag iff t0 is nonzero.
13055       //
13056       // mul(Rlo_mn, Rm, Rn);
13057       // adds(zr, t0, Rlo_mn);
13058       subs(zr, t0, 1); // Set carry iff t0 is nonzero
13059       adcs(t0, t1, Rhi_mn);
13060       adc(t1, t2, zr);
13061       mov(t2, zr);
13062     }
13063
          // Emits the set-up for outer iteration i of the second loop
          // ("for (i = len; i < 2*len; i++)"): Pa/Pm start at digit i-len of
          // a/m and Pb/Pn at digit len of b/n, then each pointer is
          // pre-incremented (Pa, Pm) or pre-decremented (Pb, Pn) by one word
          // while loading the current digit.  Rj is used as scratch for i-len
          // and the pending m*n product is cleared.
13064     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
13065       block_comment("pre2");
13066       // Pa = Pa_base + i-len;
13067       // Pb = Pb_base + len;
13068       // Pm = Pm_base + i-len;
13069       // Pn = Pn_base + len;
13070
13071       if (i.is_register()) {
13072         sub(Rj, i.as_register(), len);
13073       } else {
13074         mov(Rj, i.as_constant());
13075         sub(Rj, Rj, len);
13076       }
13077       // Rj == i-len
13078
13079       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
13080       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
13081       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
13082       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
13083
13084       // Ra = *++Pa;
13085       // Rb = *--Pb;
13086       // Rm = *++Pm;
13087       // Rn = *--Pn;
13088       ldr(Ra, pre(Pa, wordSize));
13089       ldr(Rb, pre(Pb, -wordSize));
13090       ldr(Rm, pre(Pm, wordSize));
13091       ldr(Rn, pre(Pn, -wordSize));
13092
13093       mov(Rhi_mn, zr);
13094       mov(Rlo_mn, zr);
13095     }
13096
          // Emits the tail of an outer iteration of the second loop: folds
          // the pending m*n product into the accumulator, stores the finished
          // low digit at Pm_base[i-len] and shifts the accumulator down by
          // one digit.  When i is a constant, len is read with as_constant()
          // as well, so both must then be constants.
13097     void post2(RegisterOrConstant i, RegisterOrConstant len) {
13098       block_comment("post2");
13099       if (i.is_constant()) {
13100         mov(Rj, i.as_constant()-len.as_constant());
13101       } else {
13102         sub(Rj, i.as_register(), len);
13103       }
13104
13105       adds(t0, t0, Rlo_mn); // The pending m*n, low part
13106
13107       // As soon as we know the least significant digit of our result,
13108       // store it.
13109       // Pm_base[i-len] = t0;
            // The carry produced by the adds above is consumed by the adcs
            // below; only a store is emitted in between.
13110       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
13111
13112       // t0 = t1; t1 = t2; t2 = 0;
13113       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
13114       adc(t1, t2, zr);
13115       mov(t2, zr);
13116     }
13117
13118     // A carry in t0 after Montgomery multiplication means that we
13119     // should subtract multiples of n from our result in m.  We'll
13120     // keep doing that until there is no carry.
          //
          // len is the number of 64-bit digits.  The inner loop subtracts n
          // from m digit by digit with borrow (sbcs) and writes the result
          // back to m; the final borrow is then subtracted from t0.
13121     void normalize(RegisterOrConstant len) {
13122       block_comment("normalize");
13123       // while (t0)
13124       //   t0 = sub(Pm_base, Pn_base, t0, len);
13125       Label loop, post, again;
13126       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
13127       cbz(t0, post); {
13128         bind(again); {
13129           mov(i, zr);
13130           mov(cnt, len);
13131           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
13132           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
13133           subs(zr, zr, zr); // set carry flag, i.e. no borrow
13134           align(16);
13135           bind(loop); {
                  // Only sbcs writes the flags inside this loop; the index and
                  // counter updates use the non-flag-setting add/sub so the
                  // borrow is carried from one digit to the next.
13136             sbcs(Rm, Rm, Rn);
13137             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
13138             add(i, i, 1);
                  // Preload the next pair of digits.  On the last trip this
                  // reads index len of both arrays; those values are not used.
13139             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
13140             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
13141             sub(cnt, cnt, 1);
13142           } cbnz(cnt, loop);
13143           sbc(t0, t0, zr);
13144         } cbnz(t0, again);
13145       } bind(post);
13146     }
13147
13148     // Move memory at s to d, reversing words.
13149     //    Increments d to end of copied memory
13150     //    Destroys tmp1, tmp2
13151     //    Preserves len
13152     //    Leaves s pointing to the address which was in d at start
          //
          // len counts 64-bit words.  The temporaries must be registers below
          // r19: generate_multiply()/generate_square() call this before
          // save_regs() and after restore_regs(), so a callee-saved register
          // (r19 and up) would be corrupted.
13153     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
13154       assert(tmp1->encoding() < r19->encoding(), "register corruption");
13155       assert(tmp2->encoding() < r19->encoding(), "register corruption");
13156
13157       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));  // s = one past the last source word
13158       mov(tmp1, len);                                           // loop counter; keeps len intact
13159       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
13160       sub(s, d, len, ext::uxtw, LogBytesPerWord);               // s = d - len words, i.e. d's start value
13161     }
13162     // where
          // reverse1 emits the copy of a single word: load from the word below
          // s (pre-decrement), rotate by 32 bits, which swaps the two 32-bit
          // halves of the 64-bit word, and store at d with post-increment.
13163     void reverse1(Register d, Register s, Register tmp) {
13164       ldr(tmp, pre(s, -wordSize));
13165       ror(tmp, tmp, 32);
13166       str(tmp, post(d, wordSize));
13167     }
13168
          // Emits step() followed by a second accumulation of the a*b product
          // that step() left in Rhi_ab:Rlo_ab, so that product is counted
          // twice (MACC2 in the C sketch of montgomery_square below).
13169     void step_squaring() {
13170       // An extra ACC
13171       step();
13172       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
13173     }
13174
          // Emits the extra multiply-accumulate that the squaring loops need
          // when i is even; the generated code tests bit 0 of i at run time
          // and skips the block when it is set.  i must be a register.
13175     void last_squaring(RegisterOrConstant i) {
13176       Label dont;
13177       // if ((i & 1) == 0) {
13178       tbnz(i.as_register(), 0, dont); {
13179         // MACC(Ra, Rb, t0, t1, t2);
13180         // Ra = *++Pa;
13181         // Rb = *--Pb;
              // (only the MACC is emitted; no loads are emitted here)
13182         umulh(Rhi_ab, Ra, Rb);
13183         mul(Rlo_ab, Ra, Rb);
13184         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
13185       } bind(dont);
13186     }
13187
          // Emits one m*n-only step for the squaring loops: accumulates the
          // pending m*n product, computes the next one (left pending in
          // Rhi_mn:Rlo_mn) and advances Pm / steps Pn back by one word while
          // loading the next digits.
13188     void extra_step_squaring() {
13189       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
13190
13191       // MACC(Rm, Rn, t0, t1, t2);
13192       // Rm = *++Pm;
13193       // Rn = *--Pn;
13194       umulh(Rhi_mn, Rm, Rn);
13195       mul(Rlo_mn, Rm, Rn);
13196       ldr(Rm, pre(Pm, wordSize));
13197       ldr(Rn, pre(Pn, -wordSize));
13198     }
13199
          // Squaring counterpart of post1(): the same sequence without the
          // leading a*b multiply-accumulate.  Accumulates the pending m*n
          // product, computes and stores Rm = t0 * inv at *Pm, adds the high
          // half of Rm*Rn and shifts the accumulator down by one digit.
13200     void post1_squaring() {
13201       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
13202
13203       // *Pm = Rm = t0 * inv;
13204       mul(Rm, t0, inv);
13205       str(Rm, Address(Pm));
13206
13207       // MACC(Rm, Rn, t0, t1, t2);
13208       // t0 = t1; t1 = t2; t2 = 0;
13209       umulh(Rhi_mn, Rm, Rn);
13210
13211 #ifndef PRODUCT
13212       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
13213       {
13214         mul(Rlo_mn, Rm, Rn);
13215         add(Rlo_mn, t0, Rlo_mn);
13216         Label ok;
13217         cbz(Rlo_mn, ok); {
13218           stop("broken Montgomery multiply");
13219         } bind(ok);
13220       }
13221 #endif
13222       // We have very carefully set things up so that
13223       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
13224       // the lower half of Rm * Rn because we know the result already:
13225       // it must be -t0.  t0 + (-t0) must generate a carry iff
13226       // t0 != 0.  So, rather than do a mul and an adds we just set
13227       // the carry flag iff t0 is nonzero.
13228       //
13229       // mul(Rlo_mn, Rm, Rn);
13230       // adds(zr, t0, Rlo_mn);
13231       subs(zr, t0, 1); // Set carry iff t0 is nonzero
13232       adcs(t0, t1, Rhi_mn);
13233       adc(t1, t2, zr);
13234       mov(t2, zr);
13235     }
13236
          // Emits the addition of the two-register value Rhi:Rlo into the
          // three-register accumulator t2:t1:t0, propagating the carry.
          // The parameters t0, t1, t2 shadow the members of the same name.
13237     void acc(Register Rhi, Register Rlo,
13238              Register t0, Register t1, Register t2) {
13239       adds(t0, t0, Rlo);
13240       adcs(t1, t1, Rhi);
13241       adc(t2, t2, zr);
13242     }
13243
13244   public:
13245     /**
13246      * Fast Montgomery multiplication.  The derivation of the
13247      * algorithm is in A Cryptographic Library for the Motorola
13248      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
13249      *
13250      * Arguments:
13251      *
13252      * Inputs for multiplication:
13253      *   c_rarg0   - int array elements a
13254      *   c_rarg1   - int array elements b
13255      *   c_rarg2   - int array elements n (the modulus)
13256      *   c_rarg3   - int length
13257      *   c_rarg4   - int inv
13258      *   c_rarg5   - int array elements m (the result)
13259      *
13260      * Inputs for squaring:
13261      *   c_rarg0   - int array elements a
13262      *   c_rarg1   - int array elements n (the modulus)
13263      *   c_rarg2   - int length
13264      *   c_rarg3   - int inv
13265      *   c_rarg4   - int array elements m (the result)
13266      *
           * The squaring layout applies when the generator was constructed
           * with squaring == true; the copy of b is then skipped and
           * Pb_base aliases Pa_base.
           *
           * Returns the entry address of the emitted code.
13267      */
13268     address generate_multiply() {
13269       Label argh, nothing;
13270
13271       align(CodeEntryAlignment);
13272       address entry = pc();
13273
            // Zero length: return at once.  `nothing` is bound after leave(),
            // so this path never sets up a frame.
13274       cbzw(Rlen, nothing);
13275
13276       enter();
13277
13278       // Make room.
            // Lengths above 512 ints are rejected.  Otherwise
            // 4 * sizeof(jint) bytes of stack are reserved per int of length
            // (space for four arrays of that length) and sp is rounded down
            // to a multiple of 2 * wordSize.
13279       cmpw(Rlen, 512);
13280       br(Assembler::HI, argh);
13281       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
13282       andr(sp, Ra, -2 * wordSize);
13283
13284       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
13285
13286       {
13287         // Copy input args, reversing as we go.  We use Ra as a
13288         // temporary variable.
              // Each reverse() leaves the source base register pointing at
              // its stack copy and advances Ra past that copy.
13289         reverse(Ra, Pa_base, Rlen, t0, t1);
13290         if (!_squaring)
13291           reverse(Ra, Pb_base, Rlen, t0, t1);
13292         reverse(Ra, Pn_base, Rlen, t0, t1);
13293       }
13294
13295       // Push all call-saved registers and also Pm_base which we'll need
13296       // at the end.
13297       save_regs();
13298
13299 #ifndef PRODUCT
13300       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
13301       {
13302         ldr(Rn, Address(Pn_base, 0));
13303         mul(Rlo_mn, Rn, inv);
13304         subs(zr, Rlo_mn, -1);
13305         Label ok;
13306         br(EQ, ok); {
13307           stop("broken inverse in Montgomery multiply");
13308         } bind(ok);
13309       }
13310 #endif
13311
            // Ra now points just past the reversed input copies; that part of
            // the stack area is used for m while the loops run.
13312       mov(Pm_base, Ra);
13313
13314       mov(t0, zr);
13315       mov(t1, zr);
13316       mov(t2, zr);
13317
13318       block_comment("for (int i = 0; i < len; i++) {");
13319       mov(Ri, zr); {
13320         Label loop, end;
13321         cmpw(Ri, Rlen);
13322         br(Assembler::GE, end);
13323
13324         bind(loop);
13325         pre1(Ri);
13326
13327         block_comment("  for (j = i; j; j--) {"); {
13328           movw(Rj, Ri);
13329           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
13330         } block_comment("  } // j");
13331
13332         post1();
13333         addw(Ri, Ri, 1);
13334         cmpw(Ri, Rlen);
13335         br(Assembler::LT, loop);
13336         bind(end);
13337         block_comment("} // i");
13338       }
13339
13340       block_comment("for (int i = len; i < 2*len; i++) {");
13341       mov(Ri, Rlen); {
13342         Label loop, end;
13343         cmpw(Ri, Rlen, Assembler::LSL, 1);
13344         br(Assembler::GE, end);
13345
13346         bind(loop);
13347         pre2(Ri, Rlen);
13348
13349         block_comment("  for (j = len*2-i-1; j; j--) {"); {
13350           lslw(Rj, Rlen, 1);
13351           subw(Rj, Rj, Ri);
13352           subw(Rj, Rj, 1);
13353           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
13354         } block_comment("  } // j");
13355
13356         post2(Ri, Rlen);
13357         addw(Ri, Ri, 1);
13358         cmpw(Ri, Rlen, Assembler::LSL, 1);
13359         br(Assembler::LT, loop);
13360         bind(end);
13361       }
13362       block_comment("} // i");
13363
13364       normalize(Rlen);
13365
13366       mov(Ra, Pm_base);  // Save Pm_base in Ra
13367       restore_regs();  // Restore caller's Pm_base
13368
13369       // Copy our result into caller's Pm_base
13370       reverse(Pm_base, Ra, Rlen, t0, t1);
13371
13372       leave();
13373       bind(nothing);
13374       ret(lr);
13375
13376       // handler for error case
            // Reached only through the length check above; emitted after the
            // ret so it is out of the normal path.
13377       bind(argh);
13378       stop("MontgomeryMultiply total_allocation must be <= 8192");
13379
13380       return entry;
13381     }
13382     // In C, approximately:
13383
13384     // void
13385     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
13386     //                     julong Pn_base[], julong Pm_base[],
13387     //                     julong inv, int len) {
13388     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
13389     //   julong *Pa, *Pb, *Pn, *Pm;
13390     //   julong Ra, Rb, Rn, Rm;
13391
13392     //   int i;
13393
13394     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
13395
13396     //   for (i = 0; i < len; i++) {
13397     //     int j;
13398
13399     //     Pa = Pa_base;
13400     //     Pb = Pb_base + i;
13401     //     Pm = Pm_base;
13402     //     Pn = Pn_base + i;
13403
13404     //     Ra = *Pa;
13405     //     Rb = *Pb;
13406     //     Rm = *Pm;
13407     //     Rn = *Pn;
13408
13409     //     int iters = i;
13410     //     for (j = 0; iters--; j++) {
13411     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
13412     //       MACC(Ra, Rb, t0, t1, t2);
13413     //       Ra = *++Pa;
13414     //       Rb = *--Pb;
13415     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13416     //       MACC(Rm, Rn, t0, t1, t2);
13417     //       Rm = *++Pm;
13418     //       Rn = *--Pn;
13419     //     }
13420
13421     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
13422     //     MACC(Ra, Rb, t0, t1, t2);
13423     //     *Pm = Rm = t0 * inv;
13424     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
13425     //     MACC(Rm, Rn, t0, t1, t2);
13426
13427     //     assert(t0 == 0, "broken Montgomery multiply");
13428
13429     //     t0 = t1; t1 = t2; t2 = 0;
13430     //   }
13431
13432     //   for (i = len; i < 2*len; i++) {
13433     //     int j;
13434
13435     //     Pa = Pa_base + i-len;
13436     //     Pb = Pb_base + len;
13437     //     Pm = Pm_base + i-len;
13438     //     Pn = Pn_base + len;
13439
13440     //     Ra = *++Pa;
13441     //     Rb = *--Pb;
13442     //     Rm = *++Pm;
13443     //     Rn = *--Pn;
13444
13445     //     int iters = len*2-i-1;
13446     //     for (j = i-len+1; iters--; j++) {
13447     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
13448     //       MACC(Ra, Rb, t0, t1, t2);
13449     //       Ra = *++Pa;
13450     //       Rb = *--Pb;
13451     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13452     //       MACC(Rm, Rn, t0, t1, t2);
13453     //       Rm = *++Pm;
13454     //       Rn = *--Pn;
13455     //     }
13456
13457     //     Pm_base[i-len] = t0;
13458     //     t0 = t1; t1 = t2; t2 = 0;
13459     //   }
13460
13461     //   while (t0)
13462     //     t0 = sub(Pm_base, Pn_base, t0, len);
13463     // }
13464
13465     /**
13466      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
13467      * multiplies than Montgomery multiplication so it should be up to
13468      * 25% faster.  However, its loop control is more complex and it
13469      * may actually run slower on some machines.
13470      *
13471      * Arguments:
13472      *
13473      * Inputs:
13474      *   c_rarg0   - int array elements a
13475      *   c_rarg1   - int array elements n (the modulus)
13476      *   c_rarg2   - int length
13477      *   c_rarg3   - int inv
13478      *   c_rarg4   - int array elements m (the result)
13479      *
           * Returns the entry address of the emitted code.
           *
           * NOTE(review): unlike generate_multiply() this emits no early
           * exit for a zero length and no debug-build check of inv, and b is
           * never copied, so the generator is presumably expected to have
           * been constructed with squaring == true -- confirm at the caller.
13480      */
13481     address generate_square() {
13482       Label argh;
13483
13484       align(CodeEntryAlignment);
13485       address entry = pc();
13486
13487       enter();
13488
13489       // Make room.
            // Same stack reservation as in generate_multiply():
            // 4 * sizeof(jint) bytes per int of length, at most 512 ints,
            // sp rounded down to a multiple of 2 * wordSize.
13490       cmpw(Rlen, 512);
13491       br(Assembler::HI, argh);
13492       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
13493       andr(sp, Ra, -2 * wordSize);
13494
13495       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
13496
13497       {
13498         // Copy input args, reversing as we go.  We use Ra as a
13499         // temporary variable.
13500         reverse(Ra, Pa_base, Rlen, t0, t1);
13501         reverse(Ra, Pn_base, Rlen, t0, t1);
13502       }
13503
13504       // Push all call-saved registers and also Pm_base which we'll need
13505       // at the end.
13506       save_regs();
13507
            // Ra now points just past the reversed input copies; that part of
            // the stack area is used for m while the loops run.
13508       mov(Pm_base, Ra);
13509
13510       mov(t0, zr);
13511       mov(t1, zr);
13512       mov(t2, zr);
13513
13514       block_comment("for (int i = 0; i < len; i++) {");
13515       mov(Ri, zr); {
13516         Label loop, end;
13517         bind(loop);
13518         cmp(Ri, Rlen);
13519         br(Assembler::GE, end);
13520
13521         pre1(Ri);
13522
13523         block_comment("for (j = (i+1)/2; j; j--) {"); {
13524           add(Rj, Ri, 1);
13525           lsr(Rj, Rj, 1);
13526           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
13527         } block_comment("  } // j");
13528
13529         last_squaring(Ri);
13530
13531         block_comment("  for (j = i/2; j; j--) {"); {
13532           lsr(Rj, Ri, 1);
13533           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
13534         } block_comment("  } // j");
13535
13536         post1_squaring();
13537         add(Ri, Ri, 1);
13538         cmp(Ri, Rlen);
13539         br(Assembler::LT, loop);
13540
13541         bind(end);
13542         block_comment("} // i");
13543       }
13544
13545       block_comment("for (int i = len; i < 2*len; i++) {");
13546       mov(Ri, Rlen); {
13547         Label loop, end;
13548         bind(loop);
13549         cmp(Ri, Rlen, Assembler::LSL, 1);
13550         br(Assembler::GE, end);
13551
13552         pre2(Ri, Rlen);
13553
13554         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
13555           lsl(Rj, Rlen, 1);
13556           sub(Rj, Rj, Ri);
13557           sub(Rj, Rj, 1);
13558           lsr(Rj, Rj, 1);
13559           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
13560         } block_comment("  } // j");
13561
13562         last_squaring(Ri);
13563
13564         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
13565           lsl(Rj, Rlen, 1);
13566           sub(Rj, Rj, Ri);
13567           lsr(Rj, Rj, 1);
13568           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
13569         } block_comment("  } // j");
13570
13571         post2(Ri, Rlen);
13572         add(Ri, Ri, 1);
13573         cmp(Ri, Rlen, Assembler::LSL, 1);
13574
13575         br(Assembler::LT, loop);
13576         bind(end);
13577         block_comment("} // i");
13578       }
13579
13580       normalize(Rlen);
13581
13582       mov(Ra, Pm_base);  // Save Pm_base in Ra
13583       restore_regs();  // Restore caller's Pm_base
13584
13585       // Copy our result into caller's Pm_base
13586       reverse(Pm_base, Ra, Rlen, t0, t1);
13587
13588       leave();
13589       ret(lr);
13590
13591       // handler for error case
            // Reached only through the length check above.
13592       bind(argh);
13593       stop("MontgomeryMultiply total_allocation must be <= 8192");
13594
13595       return entry;
13596     }
13597     // In C, approximately:
13598
13599     // void
13600     // montgomery_square(julong Pa_base[], julong Pn_base[],
13601     //                   julong Pm_base[], julong inv, int len) {
13602     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
13603     //   julong *Pa, *Pb, *Pn, *Pm;
13604     //   julong Ra, Rb, Rn, Rm;
13605
13606     //   int i;
13607
13608     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
13609
13610     //   for (i = 0; i < len; i++) {
13611     //     int j;
13612
13613     //     Pa = Pa_base;
13614     //     Pb = Pa_base + i;
13615     //     Pm = Pm_base;
13616     //     Pn = Pn_base + i;
13617
13618     //     Ra = *Pa;
13619     //     Rb = *Pb;
13620     //     Rm = *Pm;
13621     //     Rn = *Pn;
13622
13623     //     int iters = (i+1)/2;
13624     //     for (j = 0; iters--; j++) {
13625     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
13626     //       MACC2(Ra, Rb, t0, t1, t2);
13627     //       Ra = *++Pa;
13628     //       Rb = *--Pb;
13629     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13630     //       MACC(Rm, Rn, t0, t1, t2);
13631     //       Rm = *++Pm;
13632     //       Rn = *--Pn;
13633     //     }
13634     //     if ((i & 1) == 0) {
13635     //       assert(Ra == Pa_base[j], "must be");
13636     //       MACC(Ra, Ra, t0, t1, t2);
13637     //     }
13638     //     iters = i/2;
13639     //     assert(iters == i-j, "must be");
13640     //     for (; iters--; j++) {
13641     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13642     //       MACC(Rm, Rn, t0, t1, t2);
13643     //       Rm = *++Pm;
13644     //       Rn = *--Pn;
13645     //     }
13646
13647     //     *Pm = Rm = t0 * inv;
13648     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
13649     //     MACC(Rm, Rn, t0, t1, t2);
13650
13651     //     assert(t0 == 0, "broken Montgomery multiply");
13652
13653     //     t0 = t1; t1 = t2; t2 = 0;
13654     //   }
13655
13656     //   for (i = len; i < 2*len; i++) {
13657     //     int start = i-len+1;
13658     //     int end = start + (len - start)/2;
13659     //     int j;
13660
13661     //     Pa = Pa_base + i-len;
13662     //     Pb = Pa_base + len;
13663     //     Pm = Pm_base + i-len;
13664     //     Pn = Pn_base + len;
13665
13666     //     Ra = *++Pa;
13667     //     Rb = *--Pb;
13668     //     Rm = *++Pm;
13669     //     Rn = *--Pn;
13670
13671     //     int iters = (2*len-i-1)/2;
13672     //     assert(iters == end-start, "must be");
13673     //     for (j = start; iters--; j++) {
13674     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
13675     //       MACC2(Ra, Rb, t0, t1, t2);
13676     //       Ra = *++Pa;
13677     //       Rb = *--Pb;
13678     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13679     //       MACC(Rm, Rn, t0, t1, t2);
13680     //       Rm = *++Pm;
13681     //       Rn = *--Pn;
13682     //     }
13683     //     if ((i & 1) == 0) {
13684     //       assert(Ra == Pa_base[j], "must be");
13685     //       MACC(Ra, Ra, t0, t1, t2);
13686     //     }
13687     //     iters =  (2*len-i)/2;
13688     //     assert(iters == len-j, "must be");
13689     //     for (; iters--; j++) {
13690     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13691     //       MACC(Rm, Rn, t0, t1, t2);
13692     //       Rm = *++Pm;
13693     //       Rn = *--Pn;
13694     //     }
13695     //     Pm_base[i-len] = t0;
13696     //     t0 = t1; t1 = t2; t2 = 0;
13697     //   }
13698
13699     //   while (t0)
13700     //     t0 = sub(Pm_base, Pn_base, t0, len);
13701     // }
13702   };
13703 
13704   // Call here from the interpreter or compiled code to either load
13705   // multiple returned values from the inline type instance being
13706   // returned to registers or to store returned values to a newly
13707   // allocated inline type instance.
13708   address generate_return_value_stub(address destination, const char* name, bool has_res) {
13709     // We need to save all registers the calling convention may use so
13710     // the runtime calls read or update those registers. This needs to
13711     // be in sync with SharedRuntime::java_return_convention().
13712     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
13713     enum layout {
13714       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
13715       j_rarg6_off, j_rarg6_2,
13716       j_rarg5_off, j_rarg5_2,
13717       j_rarg4_off, j_rarg4_2,
13718       j_rarg3_off, j_rarg3_2,
13719       j_rarg2_off, j_rarg2_2,
13720       j_rarg1_off, j_rarg1_2,
13721       j_rarg0_off, j_rarg0_2,
13722 
13723       j_farg7_off, j_farg7_2,
13724       j_farg6_off, j_farg6_2,
13725       j_farg5_off, j_farg5_2,
13726       j_farg4_off, j_farg4_2,
13727       j_farg3_off, j_farg3_2,
13728       j_farg2_off, j_farg2_2,
13729       j_farg1_off, j_farg1_2,
13730       j_farg0_off, j_farg0_2,
13731 
13732       rfp_off, rfp_off2,
13733       return_off, return_off2,
13734 
13735       framesize // inclusive of return address
13736     };
13737 
13738     CodeBuffer code(name, 512, 64);
13739     MacroAssembler* masm = new MacroAssembler(&code);
13740 
13741     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
13742     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
13743     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
13744     int frame_size_in_words = frame_size_in_bytes / wordSize;
13745 
13746     OopMapSet* oop_maps = new OopMapSet();
13747     OopMap* map = new OopMap(frame_size_in_slots, 0);
13748 
13749     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
13750     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
13751     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
13752     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
13753     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
13754     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
13755     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
13756     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
13757 
13758     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
13759     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
13760     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
13761     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
13762     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
13763     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
13764     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
13765     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
13766 
13767     address start = __ pc();
13768 
13769     __ enter(); // Save FP and LR before call
13770 
13771     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
13772     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
13773     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
13774     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
13775 
13776     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
13777     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
13778     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
13779     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
13780 
13781     int frame_complete = __ offset();
13782 
13783     // Set up last_Java_sp and last_Java_fp
13784     address the_pc = __ pc();
13785     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
13786 
13787     // Call runtime
13788     __ mov(c_rarg1, r0);
13789     __ mov(c_rarg0, rthread);
13790 
13791     __ mov(rscratch1, destination);
13792     __ blr(rscratch1);
13793 
13794     oop_maps->add_gc_map(the_pc - start, map);
13795 
13796     __ reset_last_Java_frame(false);
13797 
13798     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
13799     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
13800     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
13801     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
13802 
13803     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
13804     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
13805     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
13806     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
13807 
13808     // check for pending exceptions
13809     Label pending;
13810     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
13811     __ cbnz(rscratch1, pending);
13812 
13813     if (has_res) {
13814       // We just called SharedRuntime::store_inline_type_fields_to_buf. Check if we still
13815       // need to initialize the buffer and if so, call the inline class specific pack handler.
13816       Label skip_pack;
13817       __ get_vm_result_oop(r0, rthread);
13818       __ get_vm_result_metadata(rscratch1, rthread);
13819       __ cbz(rscratch1, skip_pack);
13820       __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
13821       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_offset()));
13822       __ blr(rscratch1);
13823       __ membar(Assembler::StoreStore);
13824       __ bind(skip_pack);
13825     }
13826 
13827     __ leave();
13828     __ ret(lr);
13829 
13830     __ bind(pending);
13831     __ leave();
13832     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
13833 
13834     // -------------
13835     // make sure all code is generated
13836     masm->flush();
13837 
13838     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
13839     return stub->entry_point();
13840   }
13841 
13842   // Initialization
13843   void generate_preuniverse_stubs() {
13844     // Intentionally empty: preuniverse stubs are not needed for aarch64.
          // The method still has to exist because the StubGenerator constructor
          // dispatches to it for BlobId::stubgen_preuniverse_id.
13845   }
13846 
13847   void generate_initial_stubs() {
13848     // Generates the initial stubs and initializes their entry points.
          // Each generate_*() result is stored in the matching StubRoutines
          // field; optional stubs are only generated when their flag or
          // intrinsic-availability check passes.
13849 
13850     // Entry points that exist on all platforms. Note: This is code
13851     // that could be shared among different platforms - however the
13852     // benefit seems to be smaller than the disadvantage of having a
13853     // much more complicated generator structure. See also comment in
13854     // stubRoutines.hpp.
13855 
13856     StubRoutines::_forward_exception_entry = generate_forward_exception();
13857 
          // NOTE(review): _call_stub_return_address is handed to the generator,
          // presumably so it can be filled in as a second result - confirm
          // against generate_call_stub.
13858     StubRoutines::_call_stub_entry =
13859       generate_call_stub(StubRoutines::_call_stub_return_address);
13860 
13861     // is referenced by megamorphic call
13862     StubRoutines::_catch_exception_entry = generate_catch_exception();
13863 
13864     // Initialize table for copy memory (arraycopy) check.
          // The table is only created if it does not exist yet.
13865     if (UnsafeMemoryAccess::_table == nullptr) {
13866       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
13867     }
13868 
13869     if (UseCRC32Intrinsics) {
13870       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
13871     }
13872 
13873     if (UseCRC32CIntrinsics) {
13874       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
13875     }
13876 
          // dsin and dcos share one generator; isCos selects which is emitted.
13877     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
13878       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
13879     }
13880 
13881     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
13882       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
13883     }
13884 
          // The two float16 conversion stubs are generated together, and only
          // when both intrinsics are available.
13885     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
13886         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
13887       StubRoutines::_hf2f = generate_float16ToFloat();
13888       StubRoutines::_f2hf = generate_floatToFloat16();
13889     }
13890 
          // Both inline-type stubs come from generate_return_value_stub and
          // differ in the runtime entry, the stub name and the final flag.
          // NOTE(review): the final argument is presumably has_res - confirm
          // against generate_return_value_stub's signature.
13891     if (InlineTypeReturnedAsFields) {
13892       StubRoutines::_load_inline_type_fields_in_regs =
13893          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
13894       StubRoutines::_store_inline_type_fields_to_buf =
13895          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
13896     }
13897 
13898   }
13899 
13900   void generate_continuation_stubs() {
13901     // Continuation stubs: generated unconditionally, one entry point each
          // for thaw, the return barrier, the exception return barrier and the
          // preempt stub.
13902     StubRoutines::_cont_thaw          = generate_cont_thaw();
13903     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
13904     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
13905     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
13906   }
13907 
13908   void generate_final_stubs() {
          // Generates the final stubs and stores their entry points, then marks
          // the aarch64 stub set as completed.
13909     // support for verify_oop (must happen after universe_init)
13910     if (VerifyOops) {
13911       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
13912     }
13913 
13914     // arraycopy stubs used by compilers
          // (no result is assigned here; presumably the generator stores its
          // own entry points - confirm against generate_arraycopy_stubs)
13915     generate_arraycopy_stubs();
13916 
13917     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
13918 
13919     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
13920 
13921     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
13922     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
13923 
          // Atomic entry points are only generated on Linux builds where the
          // compiler does not define __ARM_FEATURE_ATOMICS.
13924 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
13925 
13926     generate_atomic_entry_points();
13927 
13928 #endif // LINUX && !__ARM_FEATURE_ATOMICS
13929 
13930 #ifdef COMPILER2
          // With the secondary supers table enabled the slow-path stub is always
          // generated; the table lookup stub is only generated when the test is
          // not inlined.
13931     if (UseSecondarySupersTable) {
13932       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
13933       if (! InlineSecondarySupersTest) {
13934         generate_lookup_secondary_supers_table_stub();
13935       }
13936     }
13937 #endif
13938 
13939     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
13940       StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
13941     }
13942 
13943     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
13944   }
13945 
13946   void generate_compiler_stubs() {
          // Generates the compiler stubs and stores their entry points.
          // The whole body is compiled only under COMPILER2; without it this
          // method does nothing.
13947 #ifdef COMPILER2
13948 
          // The iota indices stub is only generated when SVE is not in use.
13949     if (UseSVE == 0) {
13950       generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
13951     }
13952 
13953     // array equals stub for large arrays.
13954     if (!UseSimpleArrayEquals) {
13955       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
13956     }
13957 
13958     // arrays_hashcode stubs for large arrays, one per element type.
13959     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
13960     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
13961     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
13962     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
13963     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
13964 
13965     // byte_array_inflate stub for large arrays.
13966     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
13967 
13968     // countPositives stub for large arrays.
          // NOTE(review): _count_positives_long is passed to the generator,
          // presumably as a second entry point to be filled in - confirm
          // against generate_count_positives.
13969     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
13970 
13971     generate_compare_long_strings();
13972 
13973     generate_string_indexof_stubs();
13974 
          // BigInteger intrinsics, each guarded by its own flag.
13975     if (UseMultiplyToLenIntrinsic) {
13976       StubRoutines::_multiplyToLen = generate_multiplyToLen();
13977     }
13978 
13979     if (UseSquareToLenIntrinsic) {
13980       StubRoutines::_squareToLen = generate_squareToLen();
13981     }
13982 
13983     if (UseMulAddIntrinsic) {
13984       StubRoutines::_mulAdd = generate_mulAdd();
13985     }
13986 
13987     if (UseSIMDForBigIntegerShiftIntrinsics) {
13988       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
13989       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
13990     }
13991 
          // Montgomery multiply: use the entry returned by load_archive_data if
          // there is one, otherwise generate the stub and record its bounds.
13992     if (UseMontgomeryMultiplyIntrinsic) {
13993       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
13994       address start = load_archive_data(stub_id);
13995       if (start == nullptr) {
13996         // we have to generate it
13997         StubCodeMark mark(this, stub_id);
13998         MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
13999         start = g.generate_multiply();
14000         // record the stub start and end
14001         store_archive_data(stub_id, start, _masm->pc());
14002       }
14003       StubRoutines::_montgomeryMultiply = start;
14004     }
14005 
          // Montgomery square: same scheme as above, with the generator
          // constructed in squaring mode.
14006     if (UseMontgomerySquareIntrinsic) {
14007       StubId stub_id = StubId::stubgen_montgomerySquare_id;
14008       address start = load_archive_data(stub_id);
14009       if (start == nullptr) {
14010         // we have to generate it
14011         StubCodeMark mark(this, stub_id);
14012         MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
14013         // We use generate_multiply() rather than generate_square()
14014         // because it's faster for the sizes of modulus we care about.
14015         start = g.generate_multiply();
14016         // record the stub start and end
14017         store_archive_data(stub_id, start, _masm->pc());
14018       }
14019       StubRoutines::_montgomerySquare = start;
14020     }
14021 
14022     if (UseChaCha20Intrinsics) {
14023       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
14024     }
14025 
14026     if (UseIntPolyIntrinsics) {
14027       StubRoutines::_intpoly_montgomeryMult_P256 = generate_intpoly_montgomeryMult_P256();
14028       StubRoutines::_intpoly_assign = generate_intpoly_assign();
14029     }
14030 
14031     if (UseKyberIntrinsics) {
14032       StubRoutines::_kyberNtt = generate_kyberNtt();
14033       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
14034       StubRoutines::_kyberNttMult = generate_kyberNttMult();
14035       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
14036       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
14037       StubRoutines::_kyber12To16 = generate_kyber12To16();
14038       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
14039     }
14040 
14041     if (UseDilithiumIntrinsics) {
14042       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
14043       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
14044       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
14045       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
14046       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
14047     }
14048 
14049     if (UseBASE64Intrinsics) {
14050         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
14051         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
14052     }
14053 
14054     // data cache line writeback (generated unconditionally)
14055     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
14056     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
14057 
14058     if (UseAESIntrinsics) {
14059       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
14060       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
14061       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
14062       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
14063       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
14064     }
14065     if (UseGHASHIntrinsics) {
14066       // The small-input stub is generated first because its entry is passed
            // to generate_ghash_processBlocks on the following line.
14067       StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
14068       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
14069     }
          // The GCM stub is only generated when both AES and GHASH intrinsics
          // are enabled.
14070     if (UseAESIntrinsics && UseGHASHIntrinsics) {
14071       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
14072     }
14073 
          // Digest stubs: each generator is called twice, once with the
          // implCompress stub id and once with the implCompressMB stub id.
14074     if (UseMD5Intrinsics) {
14075       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
14076       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
14077     }
14078     if (UseSHA1Intrinsics) {
14079       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
14080       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
14081     }
14082     if (UseSHA256Intrinsics) {
14083       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
14084       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
14085     }
14086     if (UseSHA512Intrinsics) {
14087       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
14088       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
14089     }
          // SHA3: with UseSIMDForSHA3Intrinsic the regular generators are used
          // and double_keccak is generated as well; otherwise the _gpr
          // generators are used and double_keccak is not generated.
14090     if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {
14091       StubRoutines::_double_keccak         = generate_double_keccak();
14092       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
14093       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
14094     } else if (UseSHA3Intrinsics) {
14095       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
14096       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
14097     }
14098 
14099     if (UsePoly1305Intrinsics) {
14100       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
14101     }
14102 
14103     // The difference between AArch64 vs. x86_64 intrinsics implementation
14104     // include the lack of square() intrinsics; usage caused a 3.3% performance
14105     // degradation due to the efficiencies of the symmetric squaring shape in
14106     // Java vs. the inefficiencies of the leaf calls and the additional cycles
14107     // required for 64 bit multiplication in AArch64.
14108     if (UseIntPoly25519Intrinsics) {
14109       StubRoutines::_intpoly_mult_25519 = generate_intpoly_mult_25519();
14110     }
14111 
14112     // generate Adler32 intrinsics code
14113     if (UseAdler32Intrinsics) {
14114       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
14115     }
14116 
14117 #endif // COMPILER2
14118   }
14119 
14120  public:
14121   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
          // Constructing a StubGenerator is what generates the stubs: the
          // arguments are forwarded to StubCodeGenerator, then the generator
          // method matching blob_id is run. Any other blob id is fatal.
14122     switch(blob_id) {
14123     case BlobId::stubgen_preuniverse_id:
14124       generate_preuniverse_stubs();
14125       break;
14126     case BlobId::stubgen_initial_id:
14127       generate_initial_stubs();
14128       break;
14129      case BlobId::stubgen_continuation_id:
14130       generate_continuation_stubs();
14131       break;
14132     case BlobId::stubgen_compiler_id:
14133       generate_compiler_stubs();
14134       break;
14135     case BlobId::stubgen_final_id:
14136       generate_final_stubs();
14137       break;
14138     default:
            // Report the offending blob by name.
14139       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
14140       break;
14141     };
14142   }
14143 
14144 #if INCLUDE_CDS
14145   static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
          // Appends the address of each constant table listed below to
          // external_addresses, in the order written. Only compiled when
          // INCLUDE_CDS is set.
14146     // external data defined in this file
          // ADD casts its argument to address and appends it. The macro already
          // ends in ';', so the ';' after each use is an extra empty statement.
14147 #define ADD(addr) external_addresses.append((address)(addr));
14148     ADD(_sha256_round_consts);
14149     ADD(_sha512_round_consts);
14150     ADD(_sha3_round_consts);
14151     ADD(_double_keccak_round_consts);
14152     ADD(_modulus_P256);
14153     ADD(_encodeBlock_toBase64);
14154     ADD(_encodeBlock_toBase64URL);
14155     ADD(_decodeBlock_fromBase64ForNoSIMD);
14156     ADD(_decodeBlock_fromBase64URLForNoSIMD);
14157     ADD(_decodeBlock_fromBase64ForSIMD);
14158     ADD(_decodeBlock_fromBase64URLForSIMD);
14159 #undef ADD
14160   }
14161 #endif // INCLUDE_CDS
14162 }; // end class declaration
14163 
14164 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
        // Free-function entry point for stub generation. The StubGenerator
        // constructor runs the generator selected by blob_id, so constructing
        // the local object is all that is needed; g is not used afterwards.
14165   StubGenerator g(code, blob_id, stub_data);
14166 }
14167 
14168 #if INCLUDE_CDS
14169 void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
        // Free-function wrapper that forwards to the static
        // StubGenerator::init_AOTAddressTable, which appends to addresses.
14170   StubGenerator::init_AOTAddressTable(addresses);
14171 }
14172 #endif // INCLUDE_CDS
14173 
14174 #if defined (LINUX)
14175 
14176 // Define pointers to atomic stubs and initialize them to point to the
14177 // code in atomic_aarch64.S.
      //
      // DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) expands to two things:
      //   - an extern "C" declaration of
      //     aarch64_atomic_<OPNAME>_<SIZE><RELAXED>_default_impl, taking
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2) and returning
      //     uint64_t;
      //   - the definition of the aarch64_atomic_stub_t pointer
      //     aarch64_atomic_<OPNAME>_<SIZE><RELAXED>_impl, initialized to that
      //     default implementation.
      // RELAXED is pasted in as a name suffix; it may be empty, or a suffix
      // such as _relaxed, _release or _seq_cst.
14178 
14179 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
14180   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
14181     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
14182   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
14183     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
14184 
      // One stub pointer per operation / operand size / suffix combination.
14185 DEFAULT_ATOMIC_OP(fetch_add, 4, )
14186 DEFAULT_ATOMIC_OP(fetch_add, 8, )
14187 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
14188 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
14189 DEFAULT_ATOMIC_OP(xchg, 4, )
14190 DEFAULT_ATOMIC_OP(xchg, 8, )
14191 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
14192 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
14193 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
14194 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
14195 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
14196 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
14197 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
14198 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
14199 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
14200 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
14201 
14202 #undef DEFAULT_ATOMIC_OP
14203 
14204 #endif // LINUX