/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/intpow.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Constant data definitions

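// SHA-256 round constants K[0..63] (FIPS 180-4: the first 32 bits of the
// fractional parts of the cube roots of the first 64 primes).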
static const uint32_t _sha256_round_consts[64] = {
  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};

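// SHA-512 round constants K[0..79] (FIPS 180-4: the first 64 bits of the
// fractional parts of the cube roots of the first 80 primes).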
static const uint64_t _sha512_round_consts[80] = {
  0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
  0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
  0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
  0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
  0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
  0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
  0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
  0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
  0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
  0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
  0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
  0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
  0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
  0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
  0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
  0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
  0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
  0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
  0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
  0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
  0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
  0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
  0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
  0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
  0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
  0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
  0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
};

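// Keccak-f[1600] round constants RC[0..23], XORed into lane (0, 0) by the
// iota step of each SHA3 round.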
static const uint64_t _sha3_round_consts[24] = {
  0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
};

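// The same Keccak-f[1600] round constants again, kept as a separate table
// for the double-Keccak stub.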
static const uint64_t _double_keccak_round_consts[24] = {
  0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
};

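// The standard and URL-safe Base64 alphabets of RFC 4648, indexed by
// 6-bit group value.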
static const char _encodeBlock_toBase64[64] = {
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};

static const char _encodeBlock_toBase64URL[64] = {
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
};

// Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
// java.util.Base64, except that the trailing character '=' is also treated as
// an illegal value in this intrinsic. That is,
// java.util.Base64.fromBase64['='] == -2, while fromBase(URL)64ForNoSIMD['='] == 255 here.
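// For example, fromBase64ForNoSIMD['Q'] == fromBase64ForNoSIMD[0x51] == 16,
// the alphabet position of 'Q', while fromBase64ForNoSIMD['='] == 255.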
static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
  255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
};

static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
  255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
};

// A legal base64 code value is in the range [0, 127]. We need two lookups
// with tbl/tbx and combine them to get the decoded data. The first table
// vector lookup uses tbl, so out-of-range indices are set to 0 in the
// destination. The second table vector lookup uses tbx, so out-of-range
// indices are left unchanged in the destination. Input [64..126] is mapped
// to index [65..127] in the second lookup. The value at index 64 is set to
// 0, so that we know we already got the decoded data with the first lookup.
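// Worked example (table values below, not generated code): 'a' == 0x61 == 97
// is out of range for the first lookup, which therefore yields 0; the second
// lookup maps 97 to index 98, which holds 26, the decoded value. '0' == 48
// is decoded to 52 directly by the first lookup.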
static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
};

static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
};


// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
  //
  // TODO: strictly, do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread (r7)          ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]
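  //
  // n.b. the enum below names only the lower (most negative) offset of
  // each saved register pair; stp/ldp fill the adjacent slot with the
  // second register, e.g. stp(r20, r19, r20_save) writes slots -10 and -9.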
  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubId stub_id = StubId::stubgen_call_stub_id;
    GrowableArray<address> entries;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 2, "sanity check");
    address start = load_archive_data(stub_id, &entries);
    if (start != nullptr) {
      assert(entries.length() == 1, "expected 1 extra entry");
      return_address = entries.at(0);
      return start;
    }
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();
    entries.append(return_address);

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    // record the stub entry and end plus the auxiliary entry
    store_archive_data(stub_id, start, __ pc(), &entries);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubId stub_id = StubId::stubgen_catch_exception_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    // special case -- add file name string to AOT address table
    address file = (address)AOTCodeCache::add_C_string(__FILE__);
    __ lea(rscratch1, ExternalAddress(file));
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubId stub_id = StubId::stubgen_forward_exception_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {
    StubId stub_id = StubId::stubgen_verify_oop_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }

  // Generate indices for iota vector.
  void generate_iota_indices(StubId stub_id) {
    GrowableArray<address> entries;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
    address start = load_archive_data(stub_id, &entries);
    if (start != nullptr) {
      assert(entries.length() == entry_count - 1,
             "unexpected entries count %d", entries.length());
      StubRoutines::aarch64::_vector_iota_indices[0] = start;
      for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
        StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
      }
      return;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    entries.append(__ pc());
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    entries.append(__ pc());
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    entries.append(__ pc());
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    entries.append(__ pc());
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    entries.append(__ pc());
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc(), &entries);

    // install the entry addresses in the entry array
    assert(entries.length() == entry_count - 1,
           "unexpected entries count %d", entries.length());
    StubRoutines::aarch64::_vector_iota_indices[0] = start;
    for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
      StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
    }
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.
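  //
  // A caller sketch (simplified from how MacroAssembler::zero_words
  // reaches this stub): load the base into r10 and the word count into
  // r11, call the stub, then store zr into the r11 remaining words at
  // r10.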

  address generate_zero_blocks() {
    StubId stub_id = StubId::stubgen_zero_blocks_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }


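  // Direction tag for the copy stubs below. The enum values double as
  // address-step multipliers: unit = wordSize * direction, so addresses
  // advance by +8 bytes per word when copying forwards and by -8 when
  // copying backwards.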
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };
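
  // Each arraycopy stub constructs one of these helpers with its
  // decorator set and scratch registers, then routes every load and
  // store through the copy_load_at_*/copy_store_at_* wrappers above so
  // that the active GC's barrier set can instrument them (see
  // generate_copy_longs below).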

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
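  // For example (a hypothetical count, not generated code): entering
  // with count == 13, the stub copies 8 words in the main loop/drain
  // plus a 4-word subblock selected by bit 2 of the updated count, and
  // returns with bit 0 set, telling the caller that one word remains.
  //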
  address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    BasicType type;
    copy_direction direction;

    switch (stub_id) {
    case StubId::stubgen_copy_byte_f_id:
      direction = copy_forwards;
      type = T_BYTE;
      break;
    case StubId::stubgen_copy_byte_b_id:
      direction = copy_backwards;
      type = T_BYTE;
      break;
    case StubId::stubgen_copy_oop_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case StubId::stubgen_copy_oop_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    case StubId::stubgen_copy_oop_uninit_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case StubId::stubgen_copy_oop_uninit_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    default:
      ShouldNotReachHere();
    }

    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    start = __ pc();

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
 1233 
 1234       if (direction == copy_forwards) {
 1235         __ sub(s, s, 16);
 1236         __ sub(d, d, 8);
 1237       }
 1238 
 1239       // Fill 8 registers
 1240       //
 1241       // for forwards copy s was offset by -16 from the original input
 1242       // value of s so the register contents are at these offsets
 1243       // relative to the 64 bit block addressed by that original input
 1244       // and so on for each successive 64 byte block when s is updated
 1245       //
 1246       // t0 at offset 0,  t1 at offset 8
 1247       // t2 at offset 16, t3 at offset 24
 1248       // t4 at offset 32, t5 at offset 40
 1249       // t6 at offset 48, t7 at offset 56
 1250 
 1251       // for backwards copy s was not offset so the register contents
 1252       // are at these offsets into the preceding 64 byte block
 1253       // relative to that original input and so on for each successive
 1254       // preceding 64 byte block when s is updated. this explains the
 1255       // slightly counter-intuitive looking pattern of register usage
 1256       // in the stp instructions for backwards copy.
 1257       //
 1258       // t0 at offset -16, t1 at offset -8
 1259       // t2 at offset -32, t3 at offset -24
 1260       // t4 at offset -48, t5 at offset -40
 1261       // t6 at offset -64, t7 at offset -56
 1262 
 1263       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1264       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1265       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1266       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1267 
 1268       __ subs(count, count, 16);
 1269       __ br(Assembler::LO, drain);
 1270 
 1271       int prefetch = PrefetchCopyIntervalInBytes;
 1272       bool use_stride = false;
 1273       if (direction == copy_backwards) {
 1274         use_stride = prefetch > 256;
 1275         prefetch = -prefetch;
 1276         if (use_stride) __ mov(stride, prefetch);
 1277       }
 1278 
 1279       __ bind(again);
 1280 
 1281       if (PrefetchCopyIntervalInBytes > 0)
 1282         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1283 
 1284       if (direction == copy_forwards) {
 1285         // allowing for the offset of -8 the store instructions place
 1286         // registers into the target 64 bit block at the following
 1287         // offsets
 1288         //
 1289         // t0 at offset 0
 1290         // t1 at offset 8,  t2 at offset 16
 1291         // t3 at offset 24, t4 at offset 32
 1292         // t5 at offset 40, t6 at offset 48
 1293         // t7 at offset 56
 1294 
 1295         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1296         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1297         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1298         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1299         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1300         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1301         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1302         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1303         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1304       } else {
 1305         // d was not offset when we started so the registers are
 1306         // written into the 64 byte block preceding d with the following
 1307         // offsets
 1308         //
 1309         // t1 at offset -8
 1310         // t3 at offset -24, t0 at offset -16
 1311         // t5 at offset -40, t2 at offset -32
 1312         // t7 at offset -56, t4 at offset -48
 1313         //                   t6 at offset -64
 1314         //
 1315         // note that this matches the offsets previously noted for the
 1316         // loads
 1317 
 1318         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1319         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1320         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1321         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1322         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1323         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1324         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1325         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1326         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1327       }
 1328 
 1329       __ subs(count, count, 8);
 1330       __ br(Assembler::HS, again);
 1331 
 1332       // Drain
 1333       //
 1334       // this uses the same pattern of offsets and register arguments
 1335       // as above
 1336       __ bind(drain);
 1337       if (direction == copy_forwards) {
 1338         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1339         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1340         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1341         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1342         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1343       } else {
 1344         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1345         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1346         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1347         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1348         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1349       }
 1350       // now we need to copy any remaining part block which may
 1351       // include a 4 word subblock and/or a 2 word subblock.
 1352       // bits 2 and 1 in the count are the tell-tale for whether we
 1353       // have each such subblock
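            //
            // e.g. if count mod 8 == 6 then bits 2 and 1 are both set
            // and we copy a 4 word subblock then a 2 word subblock; a
            // single odd word (bit 0), if any, is left for the caller's
            // tail copy (copy_memory_small).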
 1354       {
 1355         Label L1, L2;
 1356         __ tbz(count, exact_log2(4), L1);
 1357         // this is the same as above but copying only 4 longs, hence
 1358         // with only one intervening stp between the str instructions;
 1359         // note that the offsets and registers still follow the
 1360         // same pattern
 1361         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1362         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1363         if (direction == copy_forwards) {
 1364           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1365           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1366           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1367         } else {
 1368           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1369           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1370           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1371         }
 1372         __ bind(L1);
 1373 
 1374         __ tbz(count, 1, L2);
 1375         // this is the same as above but copying only 2 longs, hence
 1376         // there is no intervening stp between the str instructions;
 1377         // note that the offset and register patterns are still
 1378         // the same
 1379         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1380         if (direction == copy_forwards) {
 1381           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1382           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1383         } else {
 1384           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1385           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1386         }
 1387         __ bind(L2);
 1388 
 1389         // for forwards copy we need to re-adjust the offsets we
 1390         // applied so that s and d point after the last words written
 1391 
 1392         if (direction == copy_forwards) {
 1393           __ add(s, s, 16);
 1394           __ add(d, d, 8);
 1395         }
 1396 
 1397       }
 1398 
 1399       __ ret(lr);
 1400     }
 1401 
 1402     // record the stub entry and end
 1403     store_archive_data(stub_id, start, __ pc());
 1404 
 1405     return start;
 1406   }
 1407 
 1408   // Small copy: less than 16 bytes.
 1409   //
 1410   // NB: Ignores all of the bits of count which represent more than 15
 1411   // bytes, so a caller doesn't have to mask them.
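        //
        // For example, with byte granularity a count of 13 (0b1101)
        // copies 8 + 4 + 1 bytes via the word, int and byte tests
        // below, skipping the 2 byte copy.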
 1412 
 1413   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1414     bool is_backwards = step < 0;
 1415     size_t granularity = g_uabs(step);
 1416     int direction = is_backwards ? -1 : 1;
 1417 
 1418     Label Lword, Lint, Lshort, Lbyte;
 1419 
 1420     assert(granularity
 1421            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1422 
 1423     const Register t0 = r3;
 1424     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1425     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1426 
 1427     // ??? I don't know if this bit-test-and-branch is the right thing
 1428     // to do.  It does a lot of jumping, resulting in several
 1429     // mispredicted branches.  It might make more sense to do this
 1430     // with something like Duff's device with a single computed branch.
 1431 
 1432     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1433     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1434     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1435     __ bind(Lword);
 1436 
 1437     if (granularity <= sizeof (jint)) {
 1438       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1439       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1440       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1441       __ bind(Lint);
 1442     }
 1443 
 1444     if (granularity <= sizeof (jshort)) {
 1445       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1446       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1447       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1448       __ bind(Lshort);
 1449     }
 1450 
 1451     if (granularity <= sizeof (jbyte)) {
 1452       __ tbz(count, 0, Lbyte);
 1453       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1454       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1455       __ bind(Lbyte);
 1456     }
 1457   }
 1458 
 1459   // All-singing all-dancing memory copy.
 1460   //
 1461   // Copy count units of memory from s to d.  The size of a unit is
 1462   // step, which can be positive or negative depending on the direction
 1463   // of copy.  If is_aligned is false, we align the source address.
 1464   //
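        // In outline: copies of at most 80 bytes (96 with SIMD) are done
        // inline with straight-line loads and stores; anything bigger is
        // aligned to a 2-word boundary and bulk-copied via the
        // generate_copy_longs stubs, with copy_memory_small picking up
        // the unaligned head and the sub-16-byte tail.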
 1465 
 1466   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1467                    Register s, Register d, Register count, int step) {
 1468     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1469     bool is_backwards = step < 0;
 1470     unsigned int granularity = g_uabs(step);
 1471     const Register t0 = r3, t1 = r4;
 1472 
 1473     // <= 80 (or 96 for SIMD) bytes are copied inline. Direction doesn't matter because we always
 1474     // load all the data before writing anything
 1475     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1476     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1477     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1478     const Register send = r17, dend = r16;
 1479     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1480     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1481     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1482 
 1483     if (PrefetchCopyIntervalInBytes > 0)
 1484       __ prfm(Address(s, 0), PLDL1KEEP);
 1485     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1486     __ br(Assembler::HI, copy_big);
 1487 
 1488     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1489     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1490 
 1491     __ cmp(count, u1(16/granularity));
 1492     __ br(Assembler::LS, copy16);
 1493 
 1494     __ cmp(count, u1(64/granularity));
 1495     __ br(Assembler::HI, copy80);
 1496 
 1497     __ cmp(count, u1(32/granularity));
 1498     __ br(Assembler::LS, copy32);
 1499 
 1500     // 33..64 bytes
 1501     if (UseSIMDForMemoryOps) {
 1502       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1503       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1504       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1505       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1506     } else {
 1507       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1508       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1509       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1510       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1511 
 1512       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1513       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1514       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1515       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1516     }
 1517     __ b(finish);
 1518 
 1519     // 17..32 bytes
 1520     __ bind(copy32);
 1521     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1522     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1523 
 1524     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1525     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1526     __ b(finish);
 1527 
 1528     // 65..80/96 bytes
 1529     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1530     __ bind(copy80);
 1531     if (UseSIMDForMemoryOps) {
 1532       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1533       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1534       // Unaligned pointers can be an issue for copying.
 1535       // The issue is more likely to occur when the granularity of the data
 1536       // is less than 4 (sizeof(jint)). Pointers for arrays of jint are at
 1537       // least 4 byte aligned; pointers for arrays of jlong are 8 byte aligned.
 1538       // The largest performance drop has been seen for the range 65-80 bytes.
 1539       // For such cases using a pair of ldp/stp instead of a third pair of
 1540       // ldpq/stpq fixes the performance issue.
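            //
            // e.g. for a 72 byte jbyte copy we store v0..v3 (bytes 0..63)
            // and then one overlapping ldp/stp pair for the final 16
            // bytes (56..71), rather than a third 32 byte ldpq/stpq pair.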
 1541       if (granularity < sizeof (jint)) {
 1542         Label copy96;
 1543         __ cmp(count, u1(80/granularity));
 1544         __ br(Assembler::HI, copy96);
 1545         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1546 
 1547         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1548         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1549 
 1550         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1551         __ b(finish);
 1552 
 1553         __ bind(copy96);
 1554       }
 1555       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1556 
 1557       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1558       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1559 
 1560       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1561     } else {
 1562       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1563       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1564       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1565       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1566       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1567 
 1568       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1569       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1570       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1571       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1572       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1573     }
 1574     __ b(finish);
 1575 
 1576     // 0..16 bytes
 1577     __ bind(copy16);
 1578     __ cmp(count, u1(8/granularity));
 1579     __ br(Assembler::LO, copy8);
 1580 
 1581     // 8..16 bytes
 1582     bs.copy_load_at_8(t0, Address(s, 0));
 1583     bs.copy_load_at_8(t1, Address(send, -8));
 1584     bs.copy_store_at_8(Address(d, 0), t0);
 1585     bs.copy_store_at_8(Address(dend, -8), t1);
 1586     __ b(finish);
 1587 
 1588     if (granularity < 8) {
 1589       // 4..7 bytes
 1590       __ bind(copy8);
 1591       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1592       __ ldrw(t0, Address(s, 0));
 1593       __ ldrw(t1, Address(send, -4));
 1594       __ strw(t0, Address(d, 0));
 1595       __ strw(t1, Address(dend, -4));
 1596       __ b(finish);
 1597       if (granularity < 4) {
 1598         // 0..3 bytes
 1599         __ bind(copy4);
 1600         __ cbz(count, finish); // get rid of 0 case
 1601         if (granularity == 2) {
 1602           __ ldrh(t0, Address(s, 0));
 1603           __ strh(t0, Address(d, 0));
 1604         } else { // granularity == 1
 1605           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1606           // the first and last byte.
 1607           // Handle the 3 byte case by loading and storing base + count/2
 1608           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1609           // This does mean in the 1 byte case we load/store the same
 1610           // byte 3 times.
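                  // i.e. count == 1: s[0]->d[0] three times;
                  //      count == 2: s[0]->d[0] then s[1]->d[1] twice;
                  //      count == 3: s[0]->d[0], s[2]->d[2], s[1]->d[1].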
 1611           __ lsr(count, count, 1);
 1612           __ ldrb(t0, Address(s, 0));
 1613           __ ldrb(t1, Address(send, -1));
 1614           __ ldrb(t2, Address(s, count));
 1615           __ strb(t0, Address(d, 0));
 1616           __ strb(t1, Address(dend, -1));
 1617           __ strb(t2, Address(d, count));
 1618         }
 1619         __ b(finish);
 1620       }
 1621     }
 1622 
 1623     __ bind(copy_big);
 1624     if (is_backwards) {
 1625       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1626       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1627     }
 1628 
 1629     // Now we've got the small case out of the way, we can align the
 1630     // source address on a 2-word boundary.
 1631 
 1632     // Here we will materialize a count in r15, which is used by copy_memory_small
 1633     // and the various generate_copy_longs stubs that we use for 2 word aligned copies.
 1634     // Up until here, we have used t9, which aliases r15, but from here on, that register
 1635     // cannot be used as a temp register, as it contains the count.
 1636 
 1637     Label aligned;
 1638 
 1639     if (is_aligned) {
 1640       // We may have to adjust by 1 word to get s 2-word-aligned.
 1641       __ tbz(s, exact_log2(wordSize), aligned);
 1642       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1643       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1644       __ sub(count, count, wordSize/granularity);
 1645     } else {
 1646       if (is_backwards) {
 1647         __ andr(r15, s, 2 * wordSize - 1);
 1648       } else {
 1649         __ neg(r15, s);
 1650         __ andr(r15, r15, 2 * wordSize - 1);
 1651       }
 1652       // r15 is the byte adjustment needed to align s.
 1653       __ cbz(r15, aligned);
 1654       int shift = exact_log2(granularity);
 1655       if (shift > 0) {
 1656         __ lsr(r15, r15, shift);
 1657       }
 1658       __ sub(count, count, r15);
 1659 
 1660 #if 0
 1661       // ?? This code is only correct for a disjoint copy.  It may or
 1662       // may not make sense to use it in that case.
 1663 
 1664       // Copy the first pair; s and d may not be aligned.
 1665       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1666       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1667 
 1668       // Align s and d, adjust count
 1669       if (is_backwards) {
 1670         __ sub(s, s, r15);
 1671         __ sub(d, d, r15);
 1672       } else {
 1673         __ add(s, s, r15);
 1674         __ add(d, d, r15);
 1675       }
 1676 #else
 1677       copy_memory_small(decorators, type, s, d, r15, step);
 1678 #endif
 1679     }
 1680 
 1681     __ bind(aligned);
 1682 
 1683     // s is now 2-word-aligned.
 1684 
 1685     // We have a count of units and some trailing bytes. Adjust the
 1686     // count and do a bulk copy of words. If the shift is zero
 1687     // perform a move instead to benefit from zero latency moves.
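          // e.g. for a jint copy (granularity 4) shift == 1 and
          // r15 = count >> 1 is the number of 8 byte words to bulk
          // copy; for a jlong copy shift == 0 and we just move count.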
 1688     int shift = exact_log2(wordSize/granularity);
 1689     if (shift > 0) {
 1690       __ lsr(r15, count, shift);
 1691     } else {
 1692       __ mov(r15, count);
 1693     }
 1694     if (direction == copy_forwards) {
 1695       if (type != T_OBJECT) {
 1696         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
 1697         __ blr(rscratch1);
 1698       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1699         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
 1700         __ blr(rscratch1);
 1701       } else {
 1702         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
 1703         __ blr(rscratch1);
 1704       }
 1705     } else {
 1706       if (type != T_OBJECT) {
 1707         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
 1708         __ blr(rscratch1);
 1709       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1710         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
 1711         __ blr(rscratch1);
 1712       } else {
 1713         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
 1714         __ blr(rscratch1);
 1715       }
 1716     }
 1717 
 1718     // And the tail.
 1719     copy_memory_small(decorators, type, s, d, count, step);
 1720 
 1721     if (granularity >= 8) __ bind(copy8);
 1722     if (granularity >= 4) __ bind(copy4);
 1723     __ bind(finish);
 1724   }
 1725 
 1726 
 1727   void clobber_registers() {
 1728 #ifdef ASSERT
 1729     RegSet clobbered
 1730       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1731     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1732     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1733     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1734       __ mov(*it, rscratch1);
 1735     }
 1736 #endif
 1737 
 1738   }
 1739 
 1740   // Scan over array at a for count oops, verifying each one.
 1741   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1742   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1743     Label loop, end;
 1744     __ mov(rscratch1, a);
 1745     __ mov(rscratch2, zr);
 1746     __ bind(loop);
 1747     __ cmp(rscratch2, count);
 1748     __ br(Assembler::HS, end);
 1749     if (size == wordSize) {
 1750       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1751       __ verify_oop(temp);
 1752     } else {
 1753       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1754       __ decode_heap_oop(temp); // calls verify_oop
 1755     }
 1756     __ add(rscratch2, rscratch2, 1);
 1757     __ b(loop);
 1758     __ bind(end);
 1759   }
 1760 
 1761   // Arguments:
 1762   //   stub_id - is used to name the stub and identify all details of
 1763   //             how to perform the copy.
 1764   //
 1765   //   nopush_entry - is assigned to the stub's post push entry point
 1766   //                  unless it is null
 1767   //
 1768   // Inputs:
 1769   //   c_rarg0   - source array address
 1770   //   c_rarg1   - destination array address
 1771   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1772   //
 1773   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1774   // the hardware handle it.  The two dwords within qwords that span
 1775   // cache line boundaries will still be loaded and stored atomically.
 1776   //
 1777   // Side Effects: nopush_entry is set to the (post push) entry point
 1778   //               so it can be used by the corresponding conjoint
 1779   //               copy method
 1780   //
 1781   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1782     int size;
 1783     bool aligned;
 1784     bool is_oop;
 1785     bool dest_uninitialized;
 1786     switch (stub_id) {
 1787     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1788       size = sizeof(jbyte);
 1789       aligned = false;
 1790       is_oop = false;
 1791       dest_uninitialized = false;
 1792       break;
 1793     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1794       size = sizeof(jbyte);
 1795       aligned = true;
 1796       is_oop = false;
 1797       dest_uninitialized = false;
 1798       break;
 1799     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1800       size = sizeof(jshort);
 1801       aligned = false;
 1802       is_oop = false;
 1803       dest_uninitialized = false;
 1804       break;
 1805     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1806       size = sizeof(jshort);
 1807       aligned = true;
 1808       is_oop = false;
 1809       dest_uninitialized = false;
 1810       break;
 1811     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1812       size = sizeof(jint);
 1813       aligned = false;
 1814       is_oop = false;
 1815       dest_uninitialized = false;
 1816       break;
 1817     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1818       size = sizeof(jint);
 1819       aligned = true;
 1820       is_oop = false;
 1821       dest_uninitialized = false;
 1822       break;
 1823     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1824       // since this is always aligned we can (should!) use the same
 1825       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1826       ShouldNotReachHere();
 1827       break;
 1828     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1829       size = sizeof(jlong);
 1830       aligned = true;
 1831       is_oop = false;
 1832       dest_uninitialized = false;
 1833       break;
 1834     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1835       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1836       aligned = !UseCompressedOops;
 1837       is_oop = true;
 1838       dest_uninitialized = false;
 1839       break;
 1840     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1841       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1842       aligned = !UseCompressedOops;
 1843       is_oop = true;
 1844       dest_uninitialized = false;
 1845       break;
 1846     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1847       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1848       aligned = !UseCompressedOops;
 1849       is_oop = true;
 1850       dest_uninitialized = true;
 1851       break;
 1852     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1853       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1854       aligned = !UseCompressedOops;
 1855       is_oop = true;
 1856       dest_uninitialized = true;
 1857       break;
 1858     default:
 1859       ShouldNotReachHere();
 1860       break;
 1861     }
 1862     // all stubs provide a 2nd entry which omits the frame push for
 1863     // use when bailing out from a conjoint copy. However we may also
 1864     // need some extra addresses for memory access protection.
 1865     int entry_count = StubInfo::entry_count(stub_id);
 1866     assert(entry_count == 2, "sanity check");
 1867     assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");
 1868 
 1869     bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
 1870     int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
 1871     GrowableArray<address> entries;
 1872     GrowableArray<address> extras;
 1873     GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
 1874     address start = load_archive_data(stub_id, &entries, extras_ptr);
 1875     if (start != nullptr) {
 1876       assert(entries.length() == entry_count - 1,
 1877              "unexpected entries count %d", entries.length());
 1878       *nopush_entry = entries.at(0);
 1879       assert(extras.length() == extra_count,
 1880              "unexpected extra count %d", extras.length());
 1881       if (add_extras) {
 1882         // register one handler at offset 0
 1883         register_unsafe_access_handlers(extras, 0, 1);
 1884       }
 1885       return start;
 1886     }
 1887 
 1888     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1889     RegSet saved_reg = RegSet::of(s, d, count);
 1890 
 1891     __ align(CodeEntryAlignment);
 1892     StubCodeMark mark(this, stub_id);
 1893     start = __ pc();
 1894     __ enter();
 1895 
 1896     *nopush_entry = __ pc();
 1897     entries.append(*nopush_entry);
 1898 
 1899     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1900     BLOCK_COMMENT("Post-Push Entry:");
 1901 
 1902     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1903     if (dest_uninitialized) {
 1904       decorators |= IS_DEST_UNINITIALIZED;
 1905     }
 1906     if (aligned) {
 1907       decorators |= ARRAYCOPY_ALIGNED;
 1908     }
 1909 
 1910     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1911     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1912 
 1913     if (is_oop) {
 1914       // save regs before copy_memory
 1915       __ push(RegSet::of(d, count), sp);
 1916     }
 1917     {
 1918       // UnsafeMemoryAccess page error: continue after unsafe access
 1919       UnsafeMemoryAccessMark umam(this, add_extras, true);
 1920       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1921     }
 1922 
 1923     if (is_oop) {
 1924       __ pop(RegSet::of(d, count), sp);
 1925       if (VerifyOops)
 1926         verify_oop_array(size, d, count, r16);
 1927     }
 1928 
 1929     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1930 
 1931     __ leave();
 1932     __ mov(r0, zr); // return 0
 1933     __ ret(lr);
 1934 
 1935     address end = __ pc();
 1936 
 1937     if (add_extras) {
 1938       // retrieve the registered handler addresses
 1939       retrieve_unsafe_access_handlers(start, end, extras);
 1940       assert(extras.length() == extra_count
 1941              , "incorrect handlers count %d", extras.length());
 1942     }
 1943 
 1944     // record the stub entry and end plus the no_push entry and any
 1945     // extra handler addresses
 1946     store_archive_data(stub_id, start, end, &entries, extras_ptr);
 1947 
 1948     return start;
 1949   }
 1950 
 1951   // Arguments:
 1952   //   stub_id - is used to name the stub and identify all details of
 1953   //             how to perform the copy.
 1954   //
 1955   //   nooverlap_target - identifies the (post push) entry for the
 1956   //             corresponding disjoint copy routine which can be
 1957   //             jumped to if the ranges do not actually overlap
 1958   //
 1959   //   nopush_entry - is assigned to the stub's post push entry point
 1960   //                  unless it is null
 1961   //
 1962   //
 1963   // Inputs:
 1964   //   c_rarg0   - source array address
 1965   //   c_rarg1   - destination array address
 1966   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1967   //
 1968   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1969   // the hardware handle it.  The two dwords within qwords that span
 1970   // cache line boundaries will still be loaded and stored atomically.
 1971   //
 1972   // Side Effects:
 1973   //   nopush_entry is set to the no-overlap entry point so it can be
 1974   //   used by some other conjoint copy method
 1975   //
 1976   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1977     int size;
 1978     bool aligned;
 1979     bool is_oop;
 1980     bool dest_uninitialized;
 1981     switch (stub_id) {
 1982     case StubId::stubgen_jbyte_arraycopy_id:
 1983       size = sizeof(jbyte);
 1984       aligned = false;
 1985       is_oop = false;
 1986       dest_uninitialized = false;
 1987       break;
 1988     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1989       size = sizeof(jbyte);
 1990       aligned = true;
 1991       is_oop = false;
 1992       dest_uninitialized = false;
 1993       break;
 1994     case StubId::stubgen_jshort_arraycopy_id:
 1995       size = sizeof(jshort);
 1996       aligned = false;
 1997       is_oop = false;
 1998       dest_uninitialized = false;
 1999       break;
 2000     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 2001       size = sizeof(jshort);
 2002       aligned = true;
 2003       is_oop = false;
 2004       dest_uninitialized = false;
 2005       break;
 2006     case StubId::stubgen_jint_arraycopy_id:
 2007       size = sizeof(jint);
 2008       aligned = false;
 2009       is_oop = false;
 2010       dest_uninitialized = false;
 2011       break;
 2012     case StubId::stubgen_arrayof_jint_arraycopy_id:
 2013       size = sizeof(jint);
 2014       aligned = true;
 2015       is_oop = false;
 2016       dest_uninitialized = false;
 2017       break;
 2018     case StubId::stubgen_jlong_arraycopy_id:
 2019       // since this is always aligned we can (should!) use the same
 2020       // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 2021       ShouldNotReachHere();
 2022       break;
 2023     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 2024       size = sizeof(jlong);
 2025       aligned = true;
 2026       is_oop = false;
 2027       dest_uninitialized = false;
 2028       break;
 2029     case StubId::stubgen_oop_arraycopy_id:
 2030       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2031       aligned = !UseCompressedOops;
 2032       is_oop = true;
 2033       dest_uninitialized = false;
 2034       break;
 2035     case StubId::stubgen_arrayof_oop_arraycopy_id:
 2036       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2037       aligned = !UseCompressedOops;
 2038       is_oop = true;
 2039       dest_uninitialized = false;
 2040       break;
 2041     case StubId::stubgen_oop_arraycopy_uninit_id:
 2042       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2043       aligned = !UseCompressedOops;
 2044       is_oop = true;
 2045       dest_uninitialized = true;
 2046       break;
 2047     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 2048       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2049       aligned = !UseCompressedOops;
 2050       is_oop = true;
 2051       dest_uninitialized = true;
 2052       break;
 2053     default:
 2054       ShouldNotReachHere();
 2055     }
 2056     // only some conjoint stubs generate a 2nd entry
 2057     int entry_count = StubInfo::entry_count(stub_id);
 2058     int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
 2059     assert(entry_count == expected_entry_count,
 2060            "expected entry count %d does not match declared entry count %d for stub %s",
 2061            expected_entry_count, entry_count, StubInfo::name(stub_id));
 2062 
 2063     // We need to protect memory accesses in certain cases
 2064     bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
 2065     int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
 2066     GrowableArray<address> entries;
 2067     GrowableArray<address> extras;
 2068     GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
 2069     GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
 2070     address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
 2071     if (start != nullptr) {
 2072       assert(entries.length() == expected_entry_count - 1,
 2073              "unexpected entries count %d", entries.length());
 2074       assert(extras.length() == extra_count,
 2075              "unexpected extra count %d", extras.length());
 2076       if (nopush_entry != nullptr) {
 2077         *nopush_entry = entries.at(0);
 2078       }
 2079       if (add_extras) {
 2080         // register one handler at offset 0
 2081         register_unsafe_access_handlers(extras, 0, 1);
 2082       }
 2083       return start;
 2084     }
 2085 
 2086     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2087     RegSet saved_regs = RegSet::of(s, d, count);
 2088     StubCodeMark mark(this, stub_id);
 2089     start = __ pc();
 2090     __ enter();
 2091 
 2092     if (nopush_entry != nullptr) {
 2093       *nopush_entry = __ pc();
 2094       entries.append(*nopush_entry);
 2095       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 2096       BLOCK_COMMENT("Post-Push Entry:");
 2097     }
 2098 
 2099     // use fwd copy when (d-s) above_equal (count*size)
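          // (the unsigned compare also covers d < s: d - s wraps to a
          // large unsigned value, so we take the no-overlap forward
          // copy, which is always safe in that case)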
 2100     Label L_overlapping;
 2101     __ sub(rscratch1, d, s);
 2102     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 2103     __ br(Assembler::LO, L_overlapping);
 2104     __ b(RuntimeAddress(nooverlap_target));
 2105     __ bind(L_overlapping);
 2106 
 2107     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 2108     if (dest_uninitialized) {
 2109       decorators |= IS_DEST_UNINITIALIZED;
 2110     }
 2111     if (aligned) {
 2112       decorators |= ARRAYCOPY_ALIGNED;
 2113     }
 2114 
 2115     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 2116     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 2117 
 2118     if (is_oop) {
 2119       // save regs before copy_memory
 2120       __ push(RegSet::of(d, count), sp);
 2121     }
 2122     {
 2123       // UnsafeMemoryAccess page error: continue after unsafe access
 2124       UnsafeMemoryAccessMark umam(this, add_extras, true);
 2125       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 2126     }
 2127     if (is_oop) {
 2128       __ pop(RegSet::of(d, count), sp);
 2129       if (VerifyOops)
 2130         verify_oop_array(size, d, count, r16);
 2131     }
 2132     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 2133     __ leave();
 2134     __ mov(r0, zr); // return 0
 2135     __ ret(lr);
 2136 
 2137     assert(entries.length() == expected_entry_count - 1,
 2138            "unexpected entries count %d", entries.length());
 2139 
 2140     address end = __ pc();
 2141 
 2142     if (add_extras) {
 2143       // retrieve the registered handler addresses
 2144       retrieve_unsafe_access_handlers(start, end, extras);
 2145       assert(extras.length() == extra_count,
 2146              "incorrect handlers count %d", extras.length());
 2147     }
 2148 
 2149     // record the stub entry and end plus any no_push entry and/or
 2150     // extra handler addresses
 2151     store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
 2152 
 2153     return start;
 2154   }
 2155 
 2156   // Helper for generating a dynamic type check.
 2157   // Smashes rscratch1, rscratch2.
 2158   void generate_type_check(Register sub_klass,
 2159                            Register super_check_offset,
 2160                            Register super_klass,
 2161                            Register temp1,
 2162                            Register temp2,
 2163                            Register result,
 2164                            Label& L_success) {
 2165     assert_different_registers(sub_klass, super_check_offset, super_klass);
 2166 
 2167     BLOCK_COMMENT("type_check:");
 2168 
 2169     Label L_miss;
 2170 
 2171     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 2172                                      super_check_offset);
 2173     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 2174 
 2175     // Fall through on failure!
 2176     __ BIND(L_miss);
 2177   }
 2178 
 2179   //
 2180   //  Generate checkcasting array copy stub
 2181   //
 2182   //  Input:
 2183   //    c_rarg0   - source array address
 2184   //    c_rarg1   - destination array address
 2185   //    c_rarg2   - element count, treated as ssize_t, can be zero
 2186   //    c_rarg3   - size_t ckoff (super_check_offset)
 2187   //    c_rarg4   - oop ckval (super_klass)
 2188   //
 2189   //  Output:
 2190   //    r0 ==  0  -  success
 2191   //    r0 == -1^K - failure, where K is partial transfer count
 2192   //
 2193   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 2194     bool dest_uninitialized;
 2195     switch (stub_id) {
 2196     case StubId::stubgen_checkcast_arraycopy_id:
 2197       dest_uninitialized = false;
 2198       break;
 2199     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 2200       dest_uninitialized = true;
 2201       break;
 2202     default:
 2203       ShouldNotReachHere();
 2204     }
 2205 
 2206     // The normal stub provides a 2nd entry which omits the frame push
 2207     // for use when bailing out from a disjoint copy.
 2208     // Only some conjoint stubs generate a 2nd entry
 2209     int entry_count = StubInfo::entry_count(stub_id);
 2210     int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
 2211     GrowableArray<address> entries;
 2212     GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
 2213     assert(entry_count == expected_entry_count,
 2214            "expected entry count %d does not match declared entry count %d for stub %s",
 2215            expected_entry_count, entry_count, StubInfo::name(stub_id));
 2216     address start = load_archive_data(stub_id, entries_ptr);
 2217     if (start != nullptr) {
 2218       assert(entries.length() + 1 == expected_entry_count,
 2219              "expected entry count %d does not match return entry count %d for stub %s",
 2220              expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
 2221       if (nopush_entry != nullptr) {
 2222         *nopush_entry = entries.at(0);
 2223       }
 2224       return start;
 2225     }
 2226 
 2227     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 2228 
 2229     // Input registers (after setup_arg_regs)
 2230     const Register from        = c_rarg0;   // source array address
 2231     const Register to          = c_rarg1;   // destination array address
 2232     const Register count       = c_rarg2;   // elements count
 2233     const Register ckoff       = c_rarg3;   // super_check_offset
 2234     const Register ckval       = c_rarg4;   // super_klass
 2235 
 2236     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 2237 
 2238     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 2239     const Register copied_oop  = r22;       // actual oop copied
 2240     const Register count_save  = r21;       // orig elements count
 2241     const Register start_to    = r20;       // destination array start address
 2242     const Register r19_klass   = r19;       // oop._klass
 2243 
 2244     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 2245     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 2246 
 2247     //---------------------------------------------------------------
 2248     // Assembler stub will be used for this call to arraycopy
 2249     // if the two arrays are subtypes of Object[] but the
 2250     // destination array type is not equal to or a supertype
 2251     // of the source type.  Each element must be separately
 2252     // checked.
 2253 
 2254     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 2255                                copied_oop, r19_klass, count_save);
 2256 
 2257     __ align(CodeEntryAlignment);
 2258     StubCodeMark mark(this, stub_id);
 2259     start = __ pc();
 2260 
 2261     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2262 
 2263 #ifdef ASSERT
 2264     // caller guarantees that the arrays really are different
 2265     // otherwise, we would have to make conjoint checks
 2266     { Label L;
 2267       __ b(L);                  // conjoint check not yet implemented
 2268       __ stop("checkcast_copy within a single array");
 2269       __ bind(L);
 2270     }
 2271 #endif //ASSERT
 2272 
 2273     // Caller of this entry point must set up the argument registers.
 2274     if (nopush_entry != nullptr) {
 2275       *nopush_entry = __ pc();
 2276       entries.append(*nopush_entry);
 2277       BLOCK_COMMENT("Entry:");
 2278     }
 2279 
 2280     // Empty array:  Nothing to do.
 2281     __ cbz(count, L_done);
 2282     __ push(RegSet::of(r19, r20, r21, r22), sp);
 2283 
 2284 #ifdef ASSERT
 2285     BLOCK_COMMENT("assert consistent ckoff/ckval");
 2286     // The ckoff and ckval must be mutually consistent,
 2287     // even though caller generates both.
 2288     { Label L;
 2289       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2290       __ ldrw(start_to, Address(ckval, sco_offset));
 2291       __ cmpw(ckoff, start_to);
 2292       __ br(Assembler::EQ, L);
 2293       __ stop("super_check_offset inconsistent");
 2294       __ bind(L);
 2295     }
 2296 #endif //ASSERT
 2297 
 2298     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 2299     bool is_oop = true;
 2300     int element_size = UseCompressedOops ? 4 : 8;
 2301     if (dest_uninitialized) {
 2302       decorators |= IS_DEST_UNINITIALIZED;
 2303     }
 2304 
 2305     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 2306     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 2307 
 2308     // save the original count
 2309     __ mov(count_save, count);
 2310 
 2311     // Copy from low to high addresses
 2312     __ mov(start_to, to);              // Save destination array start address
 2313     __ b(L_load_element);
 2314 
 2315     // ======== begin loop ========
 2316     // (Loop is rotated; its entry is L_load_element.)
 2317     // Loop control:
 2318     //   for (; count != 0; count--) {
 2319     //     copied_oop = load_heap_oop(from++);
 2320     //     ... generate_type_check ...;
 2321     //     store_heap_oop(to++, copied_oop);
 2322     //   }
 2323     __ align(OptoLoopAlignment);
 2324 
 2325     __ BIND(L_store_element);
 2326     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 2327                       __ post(to, element_size), copied_oop, noreg,
 2328                       gct1, gct2, gct3);
 2329     __ sub(count, count, 1);
 2330     __ cbz(count, L_do_card_marks);
 2331 
 2332     // ======== loop entry is here ========
 2333     __ BIND(L_load_element);
 2334     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 2335                      copied_oop, noreg, __ post(from, element_size),
 2336                      gct1);
 2337     __ cbz(copied_oop, L_store_element);
 2338 
 2339     __ load_klass(r19_klass, copied_oop); // query the object klass
 2340 
 2341     BLOCK_COMMENT("type_check:");
 2342     generate_type_check(/*sub_klass*/r19_klass,
 2343                         /*super_check_offset*/ckoff,
 2344                         /*super_klass*/ckval,
 2345                         /*r_array_base*/gct1,
 2346                         /*temp2*/gct2,
 2347                         /*result*/r10, L_store_element);
 2348 
 2349     // Fall through on failure!
 2350 
 2351     // ======== end loop ========
 2352 
 2353     // It was a real error; we must depend on the caller to finish the job.
 2354     // Register count = remaining oops, count_orig = total oops.
 2355     // Emit GC store barriers for the oops we have copied and report
 2356     // their number to the caller.
 2357 
 2358     __ subs(count, count_save, count);     // K = partially copied oop count
 2359     __ eon(count, count, zr);              // report (-1^K) to caller
 2360     __ br(Assembler::EQ, L_done_pop);
 2361 
 2362     __ BIND(L_do_card_marks);
 2363     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
 2364 
 2365     __ bind(L_done_pop);
 2366     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2367     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2368 
 2369     __ bind(L_done);
 2370     __ mov(r0, count);
 2371     __ leave();
 2372     __ ret(lr);
 2373 
 2374     // record the stub entry and end plus any no_push entry
 2375     store_archive_data(stub_id, start, __ pc(), entries_ptr);
 2376     return start;
 2377   }
 2378 
 2379   // Perform range checks on the proposed arraycopy.
 2380   // Kills temp, but nothing else.
 2381   // Also, clean the sign bits of src_pos and dst_pos.
 2382   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2383                               Register src_pos, // source position (c_rarg1)
 2384                               Register dst,     // destination array oop (c_rarg2)
 2385                               Register dst_pos, // destination position (c_rarg3)
 2386                               Register length,
 2387                               Register temp,
 2388                               Label& L_failed) {
 2389     BLOCK_COMMENT("arraycopy_range_checks:");
 2390 
 2391     assert_different_registers(rscratch1, temp);
 2392 
 2393     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2394     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2395     __ addw(temp, length, src_pos);
 2396     __ cmpw(temp, rscratch1);
 2397     __ br(Assembler::HI, L_failed);
 2398 
 2399     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2400     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2401     __ addw(temp, length, dst_pos);
 2402     __ cmpw(temp, rscratch1);
 2403     __ br(Assembler::HI, L_failed);
 2404 
 2405     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2406     __ movw(src_pos, src_pos);
 2407     __ movw(dst_pos, dst_pos);
 2408 
 2409     BLOCK_COMMENT("arraycopy_range_checks done");
 2410   }
 2411 
 2412   // These stubs get called from some dumb test routine.
 2413   // I'll write them properly when they're called from
 2414   // something that's actually doing something.
 2415   static void fake_arraycopy_stub(address src, address dst, int count) {
 2416     assert(count == 0, "huh?");
 2417   }
 2418 
 2419 
 2420   //
 2421   //  Generate 'unsafe' array copy stub
 2422   //  Though just as safe as the other stubs, it takes an unscaled
 2423   //  size_t argument instead of an element count.
 2424   //
 2425   //  Input:
 2426   //    c_rarg0   - source array address
 2427   //    c_rarg1   - destination array address
 2428   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2429   //
 2430   // Examines the alignment of the operands and dispatches
 2431   // to a long, int, short, or byte copy loop.
 2432   //
 2433   address generate_unsafe_copy(address byte_copy_entry,
 2434                                address short_copy_entry,
 2435                                address int_copy_entry,
 2436                                address long_copy_entry) {
 2437     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2438     int entry_count = StubInfo::entry_count(stub_id);
 2439     assert(entry_count == 1, "sanity check");
 2440     address start = load_archive_data(stub_id);
 2441     if (start != nullptr) {
 2442       return start;
 2443     }
 2444     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2445     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2446 
 2447     __ align(CodeEntryAlignment);
 2448     StubCodeMark mark(this, stub_id);
 2449     start = __ pc();
 2450     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2451 
 2452     // bump this on entry, not on exit:
 2453     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2454 
 2455     __ orr(rscratch1, s, d);
 2456     __ orr(rscratch1, rscratch1, count);
 2457 
 2458     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2459     __ cbz(rscratch1, L_long_aligned);
 2460     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2461     __ cbz(rscratch1, L_int_aligned);
 2462     __ tbz(rscratch1, 0, L_short_aligned);
 2463     __ b(RuntimeAddress(byte_copy_entry));
 2464 
 2465     __ BIND(L_short_aligned);
 2466     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2467     __ b(RuntimeAddress(short_copy_entry));
 2468     __ BIND(L_int_aligned);
 2469     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2470     __ b(RuntimeAddress(int_copy_entry));
 2471     __ BIND(L_long_aligned);
 2472     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2473     __ b(RuntimeAddress(long_copy_entry));
 2474 
 2475     // record the stub entry and end
 2476     store_archive_data(stub_id, start, __ pc());
 2477 
 2478     return start;
 2479   }
 2480 
 2481   //
 2482   //  Generate generic array copy stubs
 2483   //
 2484   //  Input:
 2485   //    c_rarg0    -  src oop
 2486   //    c_rarg1    -  src_pos (32-bits)
 2487   //    c_rarg2    -  dst oop
 2488   //    c_rarg3    -  dst_pos (32-bits)
 2489   //    c_rarg4    -  element count (32-bits)
 2490   //
 2491   //  Output:
 2492   //    r0 ==  0  -  success
 2493   //    r0 == -1^K - failure, where K is partial transfer count
 2494   //
 2495   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2496                                 address int_copy_entry, address oop_copy_entry,
 2497                                 address long_copy_entry, address checkcast_copy_entry) {
 2498     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2499     int entry_count = StubInfo::entry_count(stub_id);
 2500     assert(entry_count == 1, "sanity check");
 2501     address start = load_archive_data(stub_id);
 2502     if (start != nullptr) {
 2503       return start;
 2504     }
 2505     Label L_failed, L_objArray;
 2506     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2507 
 2508     // Input registers
 2509     const Register src        = c_rarg0;  // source array oop
 2510     const Register src_pos    = c_rarg1;  // source position
 2511     const Register dst        = c_rarg2;  // destination array oop
 2512     const Register dst_pos    = c_rarg3;  // destination position
 2513     const Register length     = c_rarg4;
 2514 
 2515 
 2516     // Registers used as temps
 2517     const Register dst_klass  = c_rarg5;
 2518 
 2519     __ align(CodeEntryAlignment);
 2520 
 2521     StubCodeMark mark(this, stub_id);
 2522 
 2523     start = __ pc();
 2524 
 2525     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2526 
 2527     // bump this on entry, not on exit:
 2528     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2529 
 2530     //-----------------------------------------------------------------------
 2531     // Assembler stub will be used for this call to arraycopy
 2532     // if the following conditions are met:
 2533     //
 2534     // (1) src and dst must not be null.
 2535     // (2) src_pos must not be negative.
 2536     // (3) dst_pos must not be negative.
 2537     // (4) length  must not be negative.
 2538     // (5) src klass and dst klass should be the same and not null.
 2539     // (6) src and dst should be arrays.
 2540     // (7) src_pos + length must not exceed length of src.
 2541     // (8) dst_pos + length must not exceed length of dst.
 2542     //
 2543 
 2544     //  if (src == nullptr) return -1;
 2545     __ cbz(src, L_failed);
 2546 
 2547     //  if (src_pos < 0) return -1;
 2548     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2549 
 2550     //  if (dst == nullptr) return -1;
 2551     __ cbz(dst, L_failed);
 2552 
 2553     //  if (dst_pos < 0) return -1;
 2554     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2555 
 2556     // registers used as temp
 2557     const Register scratch_length    = r16; // elements count to copy
 2558     const Register scratch_src_klass = r17; // array klass
 2559     const Register lh                = r15; // layout helper
 2560 
 2561     //  if (length < 0) return -1;
 2562     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2563     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2564 
 2565     __ load_klass(scratch_src_klass, src);
 2566 #ifdef ASSERT
 2567     //  assert(src->klass() != nullptr);
 2568     {
 2569       BLOCK_COMMENT("assert klasses not null {");
 2570       Label L1, L2;
 2571       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2572       __ bind(L1);
 2573       __ stop("broken null klass");
 2574       __ bind(L2);
 2575       __ load_klass(rscratch1, dst);
 2576       __ cbz(rscratch1, L1);     // this would be broken also
 2577       BLOCK_COMMENT("} assert klasses not null done");
 2578     }
 2579 #endif
 2580 
 2581     // Load layout helper (32-bits)
 2582     //
 2583     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2584     // 32        30    24            16              8     2                 0
 2585     //
 2586     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2587     //
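          // e.g. for a jint array the array_tag is 0x3 (typeArray) and
          // log2_element_size is 2, so lh is negative and its low bits
          // steer the element-size dispatch below.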
 2588 
 2589     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2590 
 2591     // Handle objArrays completely differently...
 2592     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2593     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2594     __ movw(rscratch1, objArray_lh);
 2595     __ eorw(rscratch2, lh, rscratch1);
 2596     __ cbzw(rscratch2, L_objArray);
 2597 
 2598     //  if (src->klass() != dst->klass()) return -1;
 2599     __ load_klass(rscratch2, dst);
 2600     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2601     __ cbnz(rscratch2, L_failed);
 2602 
 2603     //  if (!src->is_Array()) return -1;
 2604     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2605 
 2606     // At this point, it is known to be a typeArray (array_tag 0x3).
 2607 #ifdef ASSERT
 2608     {
 2609       BLOCK_COMMENT("assert primitive array {");
 2610       Label L;
 2611       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2612       __ cmpw(lh, rscratch2);
 2613       __ br(Assembler::GE, L);
 2614       __ stop("must be a primitive array");
 2615       __ bind(L);
 2616       BLOCK_COMMENT("} assert primitive array done");
 2617     }
 2618 #endif
 2619 
 2620     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2621                            rscratch2, L_failed);
 2622 
 2623     // TypeArrayKlass
 2624     //
 2625     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2626     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2627     //
 2628 
 2629     const Register rscratch1_offset = rscratch1;    // array offset
 2630     const Register r15_elsize = lh; // element size
 2631 
 2632     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2633            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2634     __ add(src, src, rscratch1_offset);           // src array offset
 2635     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2636     BLOCK_COMMENT("choose copy loop based on element size");
 2637 
 2638     // next registers should be set before the jump to corresponding stub
 2639     const Register from     = c_rarg0;  // source array address
 2640     const Register to       = c_rarg1;  // destination array address
 2641     const Register count    = c_rarg2;  // elements count
 2642 
    // 'from', 'to' and 'count' must be set in this order since they
    // alias 'src', 'src_pos' and 'dst' respectively.
 2645 
 2646     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2647 
 2648     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2649     // size in bytes).  We do a simple bitwise binary search.
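    // Dispatch summary (the log2 element size sits in the low two bits
    // of r15_elsize):
    //   bit1 bit0  element        target
    //    0    0    byte  (log2 0) fall through to the byte copy
    //    0    1    short (log2 1) L_copy_shorts
    //    1    0    int   (log2 2) L_copy_ints
    //    1    1    long  (log2 3) L_copy_longs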
 2650   __ BIND(L_copy_bytes);
 2651     __ tbnz(r15_elsize, 1, L_copy_ints);
 2652     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2653     __ lea(from, Address(src, src_pos));// src_addr
 2654     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2655     __ movw(count, scratch_length); // length
 2656     __ b(RuntimeAddress(byte_copy_entry));
 2657 
 2658   __ BIND(L_copy_shorts);
 2659     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2660     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2661     __ movw(count, scratch_length); // length
 2662     __ b(RuntimeAddress(short_copy_entry));
 2663 
 2664   __ BIND(L_copy_ints);
 2665     __ tbnz(r15_elsize, 0, L_copy_longs);
 2666     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2667     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2668     __ movw(count, scratch_length); // length
 2669     __ b(RuntimeAddress(int_copy_entry));
 2670 
 2671   __ BIND(L_copy_longs);
 2672 #ifdef ASSERT
 2673     {
 2674       BLOCK_COMMENT("assert long copy {");
 2675       Label L;
 2676       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2677       __ cmpw(r15_elsize, LogBytesPerLong);
 2678       __ br(Assembler::EQ, L);
 2679       __ stop("must be long copy, but elsize is wrong");
 2680       __ bind(L);
 2681       BLOCK_COMMENT("} assert long copy done");
 2682     }
 2683 #endif
 2684     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2685     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2686     __ movw(count, scratch_length); // length
 2687     __ b(RuntimeAddress(long_copy_entry));
 2688 
 2689     // ObjArrayKlass
 2690   __ BIND(L_objArray);
 2691     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2692 
 2693     Label L_plain_copy, L_checkcast_copy;
 2694     //  test array classes for subtyping
 2695     __ load_klass(r15, dst);
 2696     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2697     __ br(Assembler::NE, L_checkcast_copy);
 2698 
 2699     // Identically typed arrays can be copied without element-wise checks.
 2700     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2701                            rscratch2, L_failed);
 2702 
 2703     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2704     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2705     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2706     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2707     __ movw(count, scratch_length); // length
 2708   __ BIND(L_plain_copy);
 2709     __ b(RuntimeAddress(oop_copy_entry));
 2710 
 2711   __ BIND(L_checkcast_copy);
 2712     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2713     {
 2714       // Before looking at dst.length, make sure dst is also an objArray.
 2715       __ ldrw(rscratch1, Address(r15, lh_offset));
 2716       __ movw(rscratch2, objArray_lh);
 2717       __ eorw(rscratch1, rscratch1, rscratch2);
 2718       __ cbnzw(rscratch1, L_failed);
 2719 
 2720       // It is safe to examine both src.length and dst.length.
 2721       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2722                              r15, L_failed);
 2723 
 2724       __ load_klass(dst_klass, dst); // reload
 2725 
 2726       // Marshal the base address arguments now, freeing registers.
 2727       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2728       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2729       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2730       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2731       __ movw(count, length);           // length (reloaded)
 2732       Register sco_temp = c_rarg3;      // this register is free now
 2733       assert_different_registers(from, to, count, sco_temp,
 2734                                  dst_klass, scratch_src_klass);
 2735       // assert_clean_int(count, sco_temp);
 2736 
 2737       // Generate the type check.
 2738       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2739       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2740 
 2741       // Smashes rscratch1, rscratch2
 2742       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2743                           L_plain_copy);
 2744 
 2745       // Fetch destination element klass from the ObjArrayKlass header.
 2746       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2747       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2748       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2749 
 2750       // the checkcast_copy loop needs two extra arguments:
 2751       assert(c_rarg3 == sco_temp, "#3 already in place");
 2752       // Set up arguments for checkcast_copy_entry.
 2753       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2754       __ b(RuntimeAddress(checkcast_copy_entry));
 2755     }
 2756 
 2757   __ BIND(L_failed);
 2758     __ mov(r0, -1);
 2759     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2760     __ ret(lr);
 2761 
 2762     // record the stub entry and end
 2763     store_archive_data(stub_id, start, __ pc());
 2764 
 2765     return start;
 2766   }
 2767 
 2768   //
 2769   // Generate stub for array fill. If "aligned" is true, the
 2770   // "to" address is assumed to be heapword aligned.
 2771   //
 2772   // Arguments for generated stub:
 2773   //   to:    c_rarg0
 2774   //   value: c_rarg1
 2775   //   count: c_rarg2 treated as signed
 2776   //
 2777   address generate_fill(StubId stub_id) {
 2778     BasicType t;
 2779     bool aligned;
 2780 
 2781     switch (stub_id) {
 2782     case StubId::stubgen_jbyte_fill_id:
 2783       t = T_BYTE;
 2784       aligned = false;
 2785       break;
 2786     case StubId::stubgen_jshort_fill_id:
 2787       t = T_SHORT;
 2788       aligned = false;
 2789       break;
 2790     case StubId::stubgen_jint_fill_id:
 2791       t = T_INT;
 2792       aligned = false;
 2793       break;
 2794     case StubId::stubgen_arrayof_jbyte_fill_id:
 2795       t = T_BYTE;
 2796       aligned = true;
 2797       break;
 2798     case StubId::stubgen_arrayof_jshort_fill_id:
 2799       t = T_SHORT;
 2800       aligned = true;
 2801       break;
 2802     case StubId::stubgen_arrayof_jint_fill_id:
 2803       t = T_INT;
 2804       aligned = true;
 2805       break;
 2806     default:
 2807       ShouldNotReachHere();
 2808     };
 2809     int entry_count = StubInfo::entry_count(stub_id);
 2810     assert(entry_count == 1, "sanity check");
 2811     address start = load_archive_data(stub_id);
 2812     if (start != nullptr) {
 2813       return start;
 2814     }
 2815     __ align(CodeEntryAlignment);
 2816     StubCodeMark mark(this, stub_id);
 2817     start = __ pc();
 2818 
 2819     BLOCK_COMMENT("Entry:");
 2820 
    const Register to        = c_rarg0;  // destination array address
 2822     const Register value     = c_rarg1;  // value
 2823     const Register count     = c_rarg2;  // elements count
 2824 
 2825     const Register bz_base = r10;        // base for block_zero routine
 2826     const Register cnt_words = r11;      // temp register
 2827 
 2828     __ enter();
 2829 
 2830     Label L_fill_elements, L_exit1;
 2831 
 2832     int shift = -1;
 2833     switch (t) {
 2834       case T_BYTE:
 2835         shift = 0;
 2836         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2837         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2838         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2839         __ br(Assembler::LO, L_fill_elements);
 2840         break;
 2841       case T_SHORT:
 2842         shift = 1;
 2843         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2844         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2845         __ br(Assembler::LO, L_fill_elements);
 2846         break;
 2847       case T_INT:
 2848         shift = 2;
 2849         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2850         __ br(Assembler::LO, L_fill_elements);
 2851         break;
 2852       default: ShouldNotReachHere();
 2853     }
 2854 
    // Align destination address to an 8-byte boundary.
 2856     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2857     if (!aligned) {
 2858       switch (t) {
 2859         case T_BYTE:
 2860           // One byte misalignment happens only for byte arrays.
 2861           __ tbz(to, 0, L_skip_align1);
 2862           __ strb(value, Address(__ post(to, 1)));
 2863           __ subw(count, count, 1);
 2864           __ bind(L_skip_align1);
 2865           // Fallthrough
 2866         case T_SHORT:
 2867           // Two bytes misalignment happens only for byte and short (char) arrays.
 2868           __ tbz(to, 1, L_skip_align2);
 2869           __ strh(value, Address(__ post(to, 2)));
 2870           __ subw(count, count, 2 >> shift);
 2871           __ bind(L_skip_align2);
 2872           // Fallthrough
 2873         case T_INT:
          // Align to 8 bytes; we know we are 4-byte aligned to start.
 2875           __ tbz(to, 2, L_skip_align4);
 2876           __ strw(value, Address(__ post(to, 4)));
 2877           __ subw(count, count, 4 >> shift);
 2878           __ bind(L_skip_align4);
 2879           break;
 2880         default: ShouldNotReachHere();
 2881       }
 2882     }
 2883 
 2884     //
 2885     //  Fill large chunks
 2886     //
 2887     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2888     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2889     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2890     if (UseBlockZeroing) {
 2891       Label non_block_zeroing, rest;
 2892       // If the fill value is zero we can use the fast zero_words().
 2893       __ cbnz(value, non_block_zeroing);
 2894       __ mov(bz_base, to);
 2895       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2896       address tpc = __ zero_words(bz_base, cnt_words);
 2897       if (tpc == nullptr) {
 2898         fatal("CodeCache is full at generate_fill");
 2899       }
 2900       __ b(rest);
 2901       __ bind(non_block_zeroing);
 2902       __ fill_words(to, cnt_words, value);
 2903       __ bind(rest);
 2904     } else {
 2905       __ fill_words(to, cnt_words, value);
 2906     }
 2907 
 2908     // Remaining count is less than 8 bytes. Fill it by a single store.
 2909     // Note that the total length is no less than 8 bytes.
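    // For example, with 3 short elements left, 'to' ends up just past the
    // array end and the str at (to - 8) rewrites the final 8 bytes,
    // harmlessly overlapping data the word fill has already written.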
 2910     if (t == T_BYTE || t == T_SHORT) {
 2911       Label L_exit1;
 2912       __ cbzw(count, L_exit1);
 2913       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2914       __ str(value, Address(to, -8));    // overwrite some elements
 2915       __ bind(L_exit1);
 2916       __ leave();
 2917       __ ret(lr);
 2918     }
 2919 
    // Handle fills of less than 8 bytes.
 2921     Label L_fill_2, L_fill_4, L_exit2;
 2922     __ bind(L_fill_elements);
 2923     switch (t) {
 2924       case T_BYTE:
 2925         __ tbz(count, 0, L_fill_2);
 2926         __ strb(value, Address(__ post(to, 1)));
 2927         __ bind(L_fill_2);
 2928         __ tbz(count, 1, L_fill_4);
 2929         __ strh(value, Address(__ post(to, 2)));
 2930         __ bind(L_fill_4);
 2931         __ tbz(count, 2, L_exit2);
 2932         __ strw(value, Address(to));
 2933         break;
 2934       case T_SHORT:
 2935         __ tbz(count, 0, L_fill_4);
 2936         __ strh(value, Address(__ post(to, 2)));
 2937         __ bind(L_fill_4);
 2938         __ tbz(count, 1, L_exit2);
 2939         __ strw(value, Address(to));
 2940         break;
 2941       case T_INT:
 2942         __ cbzw(count, L_exit2);
 2943         __ strw(value, Address(to));
 2944         break;
 2945       default: ShouldNotReachHere();
 2946     }
 2947     __ bind(L_exit2);
 2948     __ leave();
 2949     __ ret(lr);
 2950 
 2951     // record the stub entry and end
 2952     store_archive_data(stub_id, start, __ pc());
 2953 
 2954     return start;
 2955   }
 2956 
 2957   address generate_unsafecopy_common_error_exit() {
 2958     StubId stub_id = StubId::stubgen_unsafecopy_common_id;
 2959     int entry_count = StubInfo::entry_count(stub_id);
 2960     assert(entry_count == 1, "sanity check");
 2961     address start = load_archive_data(stub_id);
 2962     if (start != nullptr) {
 2963       return start;
 2964     }
 2965     __ align(CodeEntryAlignment);
 2966     StubCodeMark mark(this, stub_id);
 2967     start = __ pc();
    __ leave();
    __ mov(r0, 0);
    __ ret(lr);
 2971 
 2972     // record the stub entry and end
 2973     store_archive_data(stub_id, start, __ pc());
 2974 
 2975     return start;
 2976   }
 2977 
 2978   //
 2979   //  Generate 'unsafe' set memory stub
 2980   //  Though just as safe as the other stubs, it takes an unscaled
 2981   //  size_t (# bytes) argument instead of an element count.
 2982   //
 2983   //  This fill operation is atomicity preserving: as long as the
 2984   //  address supplied is sufficiently aligned, all writes of up to 64
 2985   //  bits in size are single-copy atomic.
 2986   //
 2987   //  Input:
 2988   //    c_rarg0   - destination array address
 2989   //    c_rarg1   - byte count (size_t)
 2990   //    c_rarg2   - byte value
 2991   //
 2992   address generate_unsafe_setmemory() {
 2993     StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
 2994     int entry_count = StubInfo::entry_count(stub_id);
 2995     assert(entry_count == 1, "sanity check");
    // we expect one set of extra UnsafeMemoryAccess handler entries
    GrowableArray<address> extras;
    int extra_count = 1 * UnsafeMemoryAccess::COLUMN_COUNT;
 2999     address start = load_archive_data(stub_id, nullptr, &extras);
 3000     if (start != nullptr) {
 3001       assert(extras.length() == extra_count,
 3002              "unexpected extra entry count %d", extras.length());
 3003       register_unsafe_access_handlers(extras, 0, 1);
 3004       return start;
 3005     }
 3006 
 3007     __ align(CodeEntryAlignment);
 3008     StubCodeMark mark(this, stub_id);
 3009     start = __ pc();
 3010 
 3011     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 3012     Label tail;
 3013 
 3014     {
 3015     UnsafeMemoryAccessMark umam(this, true, false);
 3016 
 3017     __ enter(); // required for proper stackwalking of RuntimeStub frame
 3018 
 3019     __ dup(v0, __ T16B, value);
 3020 
 3021     if (AvoidUnalignedAccesses) {
 3022       __ cmp(count, (u1)16);
 3023       __ br(__ LO, tail);
 3024 
 3025       __ mov(rscratch1, 16);
 3026       __ andr(rscratch2, dest, 15);
 3027       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 3028       __ strq(v0, Address(dest));
 3029       __ sub(count, count, rscratch1);
 3030       __ add(dest, dest, rscratch1);
 3031     }
 3032 
 3033     __ subs(count, count, (u1)64);
 3034     __ br(__ LO, tail);
 3035     {
 3036       Label again;
 3037       __ bind(again);
 3038       __ stpq(v0, v0, Address(dest));
 3039       __ stpq(v0, v0, Address(dest, 32));
 3040 
 3041       __ subs(count, count, 64);
 3042       __ add(dest, dest, 64);
 3043       __ br(__ HS, again);
 3044     }
 3045 
 3046     __ bind(tail);
 3047     // The count of bytes is off by 64, but we don't need to correct
 3048     // it because we're only going to use the least-significant few
 3049     // count bits from here on.
 3050     // __ add(count, count, 64);
 3051 
 3052     {
 3053       Label dont;
 3054       __ tbz(count, exact_log2(32), dont);
 3055       __ stpq(v0, v0, __ post(dest, 32));
 3056       __ bind(dont);
 3057     }
 3058     {
 3059       Label dont;
 3060       __ tbz(count, exact_log2(16), dont);
 3061       __ strq(v0, __ post(dest, 16));
 3062       __ bind(dont);
 3063     }
 3064     {
 3065       Label dont;
 3066       __ tbz(count, exact_log2(8), dont);
 3067       __ strd(v0, __ post(dest, 8));
 3068       __ bind(dont);
 3069     }
 3070 
 3071     Label finished;
 3072     __ tst(count, 7);
 3073     __ br(__ EQ, finished);
 3074 
 3075     {
 3076       Label dont;
 3077       __ tbz(count, exact_log2(4), dont);
 3078       __ strs(v0, __ post(dest, 4));
 3079       __ bind(dont);
 3080     }
 3081     {
 3082       Label dont;
 3083       __ tbz(count, exact_log2(2), dont);
 3084       __ bfi(value, value, 8, 8);
 3085       __ strh(value, __ post(dest, 2));
 3086       __ bind(dont);
 3087     }
 3088     {
 3089       Label dont;
 3090       __ tbz(count, exact_log2(1), dont);
 3091       __ strb(value, Address(dest));
 3092       __ bind(dont);
 3093     }
 3094 
 3095     __ bind(finished);
 3096     __ leave();
 3097     __ ret(lr);
 3098     // have to exit the block and destroy the UnsafeMemoryAccessMark
 3099     // in order to retrieve the handler end address
 3100     }
 3101 
 3102     // install saved handler addresses in extras
 3103     address end = __ pc();
 3104     retrieve_unsafe_access_handlers(start, end, extras);
 3105     assert(extras.length() == extra_count,
 3106            "incorrect handlers count %d", extras.length());
 3107     // record the stub entry and end plus the extras
 3108     store_archive_data(stub_id, start, end, nullptr, &extras);
 3109 
 3110     return start;
 3111   }
 3112 
 3113   address generate_data_cache_writeback() {
 3114     const Register line        = c_rarg0;  // address of line to write back
 3115 
 3116     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 3117     int entry_count = StubInfo::entry_count(stub_id);
 3118     assert(entry_count == 1, "sanity check");
 3119     address start = load_archive_data(stub_id);
 3120     if (start != nullptr) {
 3121       return start;
 3122     }
 3123     __ align(CodeEntryAlignment);
 3124     StubCodeMark mark(this, stub_id);
 3125 
 3126     start = __ pc();
 3127     __ enter();
 3128     __ cache_wb(Address(line, 0));
 3129     __ leave();
 3130     __ ret(lr);
 3131 
 3132     // record the stub entry and end
 3133     store_archive_data(stub_id, start, __ pc());
 3134 
 3135     return start;
 3136   }
 3137 
 3138   address generate_data_cache_writeback_sync() {
 3139     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 3140     int entry_count = StubInfo::entry_count(stub_id);
 3141     assert(entry_count == 1, "sanity check");
 3142     address start = load_archive_data(stub_id);
 3143     if (start != nullptr) {
 3144       return start;
 3145     }
 3146     const Register is_pre     = c_rarg0;  // pre or post sync
 3147     __ align(CodeEntryAlignment);
 3148     StubCodeMark mark(this, stub_id);
 3149 
    // pre wbsync is a no-op
    // post wbsync requires a memory barrier
 3152 
 3153     Label skip;
 3154     start = __ pc();
 3155     __ enter();
 3156     __ cbnz(is_pre, skip);
 3157     __ cache_wbsync(false);
 3158     __ bind(skip);
 3159     __ leave();
 3160     __ ret(lr);
 3161 
 3162     // record the stub entry and end
 3163     store_archive_data(stub_id, start, __ pc());
 3164 
 3165     return start;
 3166   }
 3167 
 3168   void generate_arraycopy_stubs() {
    // Some copy stubs publish a normal entry and then a 2nd 'fallback'
    // entry immediately following their stack push. This can be used
    // as a post-push branch target for compatible stubs when they
    // identify a special case that can be handled by the fallback
    // stub, e.g. a disjoint copy stub may be used as a special-case
    // fallback for its compatible conjoint copy stub.
 3175     //
    // A nopush entry is always returned via the following local and
    // then published by assigning to the appropriate entry field in
    // class StubRoutines. The entry value is then passed to the
    // generator of the compatible stub. That means the entry must be
    // listed when saving to/restoring from the AOT cache, ensuring
    // that the inter-stub jumps are noted at AOT-cache save and
    // relocated at AOT-cache load.
 3183     address nopush_entry;
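
    // A typical pairing, as used repeatedly below (sketch; _X_ stands in
    // for an element type such as jbyte):
    //
    //   StubRoutines::_X_disjoint_arraycopy        = generate_disjoint_copy(id, &nopush_entry);
    //   StubRoutines::_X_disjoint_arraycopy_nopush = nopush_entry;  // saved/restored with the AOT cache
    //   StubRoutines::_X_arraycopy                 = generate_conjoint_copy(id, _X_disjoint_arraycopy_nopush, ...);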
 3184 
 3185     // generate the common exit first so later stubs can rely on it if
 3186     // they want an UnsafeMemoryAccess exit non-local to the stub
 3187     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 3188     // register the stub as the default exit with class UnsafeMemoryAccess
 3189     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 3190 
    // generate and publish aarch64-specific bulk copy routines first
    // so we can call them from other copy stubs
 3193     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3194     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3195 
 3196     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3197     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3198 
 3199     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 3200     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 3201 
 3202     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 3203 
 3204     //*** jbyte
 3205     // Always need aligned and unaligned versions
 3206     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 3207     // disjoint nopush entry is needed by conjoint copy
 3208     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 3209     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 3210     // conjoint nopush entry is needed by generic/unsafe copy
 3211     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 3212     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 3213     // disjoint arrayof nopush entry is needed by conjoint copy
 3214     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 3215     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 3216 
 3217     //*** jshort
 3218     // Always need aligned and unaligned versions
 3219     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 3220     // disjoint nopush entry is needed by conjoint copy
 3221     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 3222     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 3223     // conjoint nopush entry is used by generic/unsafe copy
 3224     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 3225     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 3226     // disjoint arrayof nopush entry is needed by conjoint copy
 3227     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 3228     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 3229 
 3230     //*** jint
 3231     // Aligned versions
 3232     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 3233     // disjoint arrayof nopush entry is needed by conjoint copy
 3234     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 3235     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
    // In 64-bit we need both aligned and unaligned versions of jint arraycopy.
 3237     // jint_arraycopy_nopush always points to the unaligned version
 3238     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 3239     // disjoint nopush entry is needed by conjoint copy
 3240     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 3241     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 3242     // conjoint nopush entry is needed by generic/unsafe copy
 3243     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 3244 
 3245     //*** jlong
 3246     // It is always aligned
 3247     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 3248     // disjoint arrayof nopush entry is needed by conjoint copy
 3249     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 3250     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 3251     // conjoint nopush entry is needed by generic/unsafe copy
 3252     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 3253     // disjoint normal/nopush and conjoint normal entries are not
 3254     // generated since the arrayof versions are the same
 3255     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 3256     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 3257     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 3258 
 3259     //*** oops
 3260     {
 3261       StubRoutines::_arrayof_oop_disjoint_arraycopy
 3262         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 3263       // disjoint arrayof nopush entry is needed by conjoint copy
 3264       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 3265       StubRoutines::_arrayof_oop_arraycopy
 3266         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 3267       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 3268       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 3269       // Aligned versions without pre-barriers
 3270       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 3271         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 3272       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 3273       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 3274       // note that we don't need a returned nopush entry because the
 3275       // generic/unsafe copy does not cater for uninit arrays.
 3276       StubRoutines::_arrayof_oop_arraycopy_uninit
 3277         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 3278     }
 3279 
 3280     // for oop copies reuse arrayof entries for non-arrayof cases
 3281     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 3282     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 3283     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 3284     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 3285     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 3286     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 3287 
 3288     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 3289     // checkcast nopush entry is needed by generic copy
 3290     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 3291     // note that we don't need a returned nopush entry because the
 3292     // generic copy does not cater for uninit arrays.
 3293     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 3294 
 3295     // unsafe arraycopy may fallback on conjoint stubs
 3296     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 3297                                                               StubRoutines::_jshort_arraycopy_nopush,
 3298                                                               StubRoutines::_jint_arraycopy_nopush,
 3299                                                               StubRoutines::_jlong_arraycopy_nopush);
 3300 
 3301     // generic arraycopy may fallback on conjoint stubs
 3302     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 3303                                                                StubRoutines::_jshort_arraycopy_nopush,
 3304                                                                StubRoutines::_jint_arraycopy_nopush,
 3305                                                                StubRoutines::_oop_arraycopy_nopush,
 3306                                                                StubRoutines::_jlong_arraycopy_nopush,
 3307                                                                StubRoutines::_checkcast_arraycopy_nopush);
 3308 
 3309     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 3310     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 3311     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 3312     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 3313     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 3314     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 3315   }
 3316 
 3317   void generate_math_stubs() { Unimplemented(); }
 3318 
 3319   // Arguments:
 3320   //
 3321   // Inputs:
 3322   //   c_rarg0   - source byte array address
 3323   //   c_rarg1   - destination byte array address
 3324   //   c_rarg2   - sessionKe (key) in little endian int array
 3325   //
 3326   address generate_aescrypt_encryptBlock() {
 3327     assert(UseAES, "need AES cryptographic extension support");
 3328     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 3329     int entry_count = StubInfo::entry_count(stub_id);
 3330     assert(entry_count == 1, "sanity check");
 3331     address start = load_archive_data(stub_id);
 3332     if (start != nullptr) {
 3333       return start;
 3334     }
 3335     __ align(CodeEntryAlignment);
 3336     StubCodeMark mark(this, stub_id);
 3337 
 3338     const Register from        = c_rarg0;  // source array address
 3339     const Register to          = c_rarg1;  // destination array address
 3340     const Register key         = c_rarg2;  // key array address
 3341     const Register keylen      = rscratch1;
 3342 
 3343     start = __ pc();
 3344     __ enter();
 3345 
 3346     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3347 
 3348     __ aesenc_loadkeys(key, keylen);
 3349     __ aesecb_encrypt(from, to, keylen);
 3350 
 3351     __ mov(r0, 0);
 3352 
 3353     __ leave();
 3354     __ ret(lr);
 3355 
 3356     // record the stub entry and end
 3357     store_archive_data(stub_id, start, __ pc());
 3358 
 3359     return start;
 3360   }
 3361 
 3362   // Arguments:
 3363   //
 3364   // Inputs:
 3365   //   c_rarg0   - source byte array address
 3366   //   c_rarg1   - destination byte array address
 3367   //   c_rarg2   - sessionKd (key) in little endian int array
 3368   //
 3369   address generate_aescrypt_decryptBlock() {
 3370     assert(UseAES, "need AES cryptographic extension support");
 3371     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 3372     int entry_count = StubInfo::entry_count(stub_id);
 3373     assert(entry_count == 1, "sanity check");
 3374     address start = load_archive_data(stub_id);
 3375     if (start != nullptr) {
 3376       return start;
 3377     }
 3378     __ align(CodeEntryAlignment);
 3379     StubCodeMark mark(this, stub_id);
 3380     Label L_doLast;
 3381 
 3382     const Register from        = c_rarg0;  // source array address
 3383     const Register to          = c_rarg1;  // destination array address
 3384     const Register key         = c_rarg2;  // key array address
 3385     const Register keylen      = rscratch1;
 3386 
 3387     start = __ pc();
 3388     __ enter(); // required for proper stackwalking of RuntimeStub frame
 3389 
 3390     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3391 
 3392     __ aesecb_decrypt(from, to, key, keylen);
 3393 
 3394     __ mov(r0, 0);
 3395 
 3396     __ leave();
 3397     __ ret(lr);
 3398 
 3399     // record the stub entry and end
 3400     store_archive_data(stub_id, start, __ pc());
 3401 
 3402     return start;
 3403   }
 3404 
 3405   // Arguments:
 3406   //
 3407   // Inputs:
 3408   //   c_rarg0   - source byte array address
 3409   //   c_rarg1   - destination byte array address
 3410   //   c_rarg2   - sessionKe (key) in little endian int array
 3411   //   c_rarg3   - r vector byte array address
 3412   //   c_rarg4   - input length
 3413   //
 3414   // Output:
  //   r0        - input length
 3416   //
 3417   address generate_cipherBlockChaining_encryptAESCrypt() {
 3418     assert(UseAES, "need AES cryptographic extension support");
 3419     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 3420     int entry_count = StubInfo::entry_count(stub_id);
 3421     assert(entry_count == 1, "sanity check");
 3422     address start = load_archive_data(stub_id);
 3423     if (start != nullptr) {
 3424       return start;
 3425     }
 3426     __ align(CodeEntryAlignment);
 3427     StubCodeMark mark(this, stub_id);
 3428 
 3429     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3430 
 3431     const Register from        = c_rarg0;  // source array address
 3432     const Register to          = c_rarg1;  // destination array address
 3433     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector
                                           // and left holding the last encrypted block
 3436     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3437     const Register keylen      = rscratch1;
 3438 
 3439     start = __ pc();
 3440 
 3441       __ enter();
 3442 
 3443       __ movw(rscratch2, len_reg);
 3444 
 3445       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3446 
 3447       __ ld1(v0, __ T16B, rvec);
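
      // keylen holds the expanded key length in ints, 4 * (rounds + 1):
      // 44, 52 or 60 for AES-128, AES-192 and AES-256 respectively. The
      // branches below skip the round keys that shorter keys do not have.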
 3448 
 3449       __ cmpw(keylen, 52);
 3450       __ br(Assembler::CC, L_loadkeys_44);
 3451       __ br(Assembler::EQ, L_loadkeys_52);
 3452 
 3453       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3454       __ rev32(v17, __ T16B, v17);
 3455       __ rev32(v18, __ T16B, v18);
 3456     __ BIND(L_loadkeys_52);
 3457       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3458       __ rev32(v19, __ T16B, v19);
 3459       __ rev32(v20, __ T16B, v20);
 3460     __ BIND(L_loadkeys_44);
 3461       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3462       __ rev32(v21, __ T16B, v21);
 3463       __ rev32(v22, __ T16B, v22);
 3464       __ rev32(v23, __ T16B, v23);
 3465       __ rev32(v24, __ T16B, v24);
 3466       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3467       __ rev32(v25, __ T16B, v25);
 3468       __ rev32(v26, __ T16B, v26);
 3469       __ rev32(v27, __ T16B, v27);
 3470       __ rev32(v28, __ T16B, v28);
 3471       __ ld1(v29, v30, v31, __ T16B, key);
 3472       __ rev32(v29, __ T16B, v29);
 3473       __ rev32(v30, __ T16B, v30);
 3474       __ rev32(v31, __ T16B, v31);
 3475 
 3476     __ BIND(L_aes_loop);
 3477       __ ld1(v1, __ T16B, __ post(from, 16));
 3478       __ eor(v0, __ T16B, v0, v1);
 3479 
 3480       __ br(Assembler::CC, L_rounds_44);
 3481       __ br(Assembler::EQ, L_rounds_52);
 3482 
 3483       __ aese(v0, v17); __ aesmc(v0, v0);
 3484       __ aese(v0, v18); __ aesmc(v0, v0);
 3485     __ BIND(L_rounds_52);
 3486       __ aese(v0, v19); __ aesmc(v0, v0);
 3487       __ aese(v0, v20); __ aesmc(v0, v0);
 3488     __ BIND(L_rounds_44);
 3489       __ aese(v0, v21); __ aesmc(v0, v0);
 3490       __ aese(v0, v22); __ aesmc(v0, v0);
 3491       __ aese(v0, v23); __ aesmc(v0, v0);
 3492       __ aese(v0, v24); __ aesmc(v0, v0);
 3493       __ aese(v0, v25); __ aesmc(v0, v0);
 3494       __ aese(v0, v26); __ aesmc(v0, v0);
 3495       __ aese(v0, v27); __ aesmc(v0, v0);
 3496       __ aese(v0, v28); __ aesmc(v0, v0);
 3497       __ aese(v0, v29); __ aesmc(v0, v0);
 3498       __ aese(v0, v30);
 3499       __ eor(v0, __ T16B, v0, v31);
 3500 
 3501       __ st1(v0, __ T16B, __ post(to, 16));
 3502 
 3503       __ subw(len_reg, len_reg, 16);
 3504       __ cbnzw(len_reg, L_aes_loop);
 3505 
 3506       __ st1(v0, __ T16B, rvec);
 3507 
 3508       __ mov(r0, rscratch2);
 3509 
 3510       __ leave();
 3511       __ ret(lr);
 3512 
 3513       // record the stub entry and end
 3514       store_archive_data(stub_id, start, __ pc());
 3515 
 3516       return start;
 3517   }
 3518 
 3519   // Arguments:
 3520   //
 3521   // Inputs:
 3522   //   c_rarg0   - source byte array address
 3523   //   c_rarg1   - destination byte array address
 3524   //   c_rarg2   - sessionKd (key) in little endian int array
 3525   //   c_rarg3   - r vector byte array address
 3526   //   c_rarg4   - input length
 3527   //
 3528   // Output:
 3529   //   r0        - input length
 3530   //
 3531   address generate_cipherBlockChaining_decryptAESCrypt() {
 3532     assert(UseAES, "need AES cryptographic extension support");
 3533     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3534     int entry_count = StubInfo::entry_count(stub_id);
 3535     assert(entry_count == 1, "sanity check");
 3536     address start = load_archive_data(stub_id);
 3537     if (start != nullptr) {
 3538       return start;
 3539     }
 3540     __ align(CodeEntryAlignment);
 3541     StubCodeMark mark(this, stub_id);
 3542 
 3543     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3544 
 3545     const Register from        = c_rarg0;  // source array address
 3546     const Register to          = c_rarg1;  // destination array address
 3547     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector
                                           // and left holding the last input ciphertext block
 3550     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3551     const Register keylen      = rscratch1;
 3552 
 3553     start = __ pc();
 3554 
 3555       __ enter();
 3556 
 3557       __ movw(rscratch2, len_reg);
 3558 
 3559       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3560 
 3561       __ ld1(v2, __ T16B, rvec);
 3562 
 3563       __ ld1(v31, __ T16B, __ post(key, 16));
 3564       __ rev32(v31, __ T16B, v31);
 3565 
 3566       __ cmpw(keylen, 52);
 3567       __ br(Assembler::CC, L_loadkeys_44);
 3568       __ br(Assembler::EQ, L_loadkeys_52);
 3569 
 3570       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3571       __ rev32(v17, __ T16B, v17);
 3572       __ rev32(v18, __ T16B, v18);
 3573     __ BIND(L_loadkeys_52);
 3574       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3575       __ rev32(v19, __ T16B, v19);
 3576       __ rev32(v20, __ T16B, v20);
 3577     __ BIND(L_loadkeys_44);
 3578       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3579       __ rev32(v21, __ T16B, v21);
 3580       __ rev32(v22, __ T16B, v22);
 3581       __ rev32(v23, __ T16B, v23);
 3582       __ rev32(v24, __ T16B, v24);
 3583       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3584       __ rev32(v25, __ T16B, v25);
 3585       __ rev32(v26, __ T16B, v26);
 3586       __ rev32(v27, __ T16B, v27);
 3587       __ rev32(v28, __ T16B, v28);
 3588       __ ld1(v29, v30, __ T16B, key);
 3589       __ rev32(v29, __ T16B, v29);
 3590       __ rev32(v30, __ T16B, v30);
 3591 
 3592     __ BIND(L_aes_loop);
 3593       __ ld1(v0, __ T16B, __ post(from, 16));
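      // Keep a copy of the ciphertext block: CBC decryption chains it in
      // as the next block's IV (moved to v2 once this block is finished).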
 3594       __ orr(v1, __ T16B, v0, v0);
 3595 
 3596       __ br(Assembler::CC, L_rounds_44);
 3597       __ br(Assembler::EQ, L_rounds_52);
 3598 
 3599       __ aesd(v0, v17); __ aesimc(v0, v0);
 3600       __ aesd(v0, v18); __ aesimc(v0, v0);
 3601     __ BIND(L_rounds_52);
 3602       __ aesd(v0, v19); __ aesimc(v0, v0);
 3603       __ aesd(v0, v20); __ aesimc(v0, v0);
 3604     __ BIND(L_rounds_44);
 3605       __ aesd(v0, v21); __ aesimc(v0, v0);
 3606       __ aesd(v0, v22); __ aesimc(v0, v0);
 3607       __ aesd(v0, v23); __ aesimc(v0, v0);
 3608       __ aesd(v0, v24); __ aesimc(v0, v0);
 3609       __ aesd(v0, v25); __ aesimc(v0, v0);
 3610       __ aesd(v0, v26); __ aesimc(v0, v0);
 3611       __ aesd(v0, v27); __ aesimc(v0, v0);
 3612       __ aesd(v0, v28); __ aesimc(v0, v0);
 3613       __ aesd(v0, v29); __ aesimc(v0, v0);
 3614       __ aesd(v0, v30);
 3615       __ eor(v0, __ T16B, v0, v31);
 3616       __ eor(v0, __ T16B, v0, v2);
 3617 
 3618       __ st1(v0, __ T16B, __ post(to, 16));
 3619       __ orr(v2, __ T16B, v1, v1);
 3620 
 3621       __ subw(len_reg, len_reg, 16);
 3622       __ cbnzw(len_reg, L_aes_loop);
 3623 
 3624       __ st1(v2, __ T16B, rvec);
 3625 
 3626       __ mov(r0, rscratch2);
 3627 
 3628       __ leave();
 3629       __ ret(lr);
 3630 
 3631     // record the stub entry and end
 3632     store_archive_data(stub_id, start, __ pc());
 3633 
 3634     return start;
 3635   }
 3636 
 3637   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3638   // Inputs: 128-bits. in is preserved.
 3639   // The least-significant 64-bit word is in the upper dword of each vector.
 3640   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 3641   // Output: result
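  //
  // Worked example (illustrative): with in = { MSD = 1, LSD = 2^64 - 1 }
  // and inc = 1, addv wraps the LSD lane to 0; the unsigned compare then
  // flags that lane with -1, ext moves the flag to the MSD lane, and the
  // final subv turns it into a carry: result = { MSD = 2, LSD = 0 }.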
 3642   void be_add_128_64(FloatRegister result, FloatRegister in,
 3643                      FloatRegister inc, FloatRegister tmp) {
 3644     assert_different_registers(result, tmp, inc);
 3645 
 3646     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3647                                            // input
 3648     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
 3649     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3650                                            // MSD == 0 (must be!) to LSD
 3651     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3652   }
 3653 
 3654   // CTR AES crypt.
 3655   // Arguments:
 3656   //
 3657   // Inputs:
 3658   //   c_rarg0   - source byte array address
 3659   //   c_rarg1   - destination byte array address
 3660   //   c_rarg2   - sessionKe (key) in little endian int array
 3661   //   c_rarg3   - counter vector byte array address
 3662   //   c_rarg4   - input length
 3663   //   c_rarg5   - saved encryptedCounter start
 3664   //   c_rarg6   - saved used length
 3665   //
 3666   // Output:
 3667   //   r0       - input length
 3668   //
 3669   address generate_counterMode_AESCrypt() {
 3670     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3671     int entry_count = StubInfo::entry_count(stub_id);
 3672     assert(entry_count == 1, "sanity check");
 3673     address start = load_archive_data(stub_id);
 3674     if (start != nullptr) {
 3675       return start;
 3676     }
 3677     const Register in = c_rarg0;
 3678     const Register out = c_rarg1;
 3679     const Register key = c_rarg2;
 3680     const Register counter = c_rarg3;
 3681     const Register saved_len = c_rarg4, len = r10;
 3682     const Register saved_encrypted_ctr = c_rarg5;
 3683     const Register used_ptr = c_rarg6, used = r12;
 3684 
 3685     const Register offset = r7;
 3686     const Register keylen = r11;
 3687 
 3688     const unsigned char block_size = 16;
 3689     const int bulk_width = 4;
 3690     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3691     // performance with larger data sizes, but it also means that the
 3692     // fast path isn't used until you have at least 8 blocks, and up
 3693     // to 127 bytes of data will be executed on the slow path. For
 3694     // that reason, and also so as not to blow away too much icache, 4
 3695     // blocks seems like a sensible compromise.
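    // With bulk_width == 4 the bulk path therefore engages at 64 bytes,
    // leaving at most 63 bytes to the block/byte-at-a-time paths.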
 3696 
 3697     // Algorithm:
 3698     //
 3699     //    if (len == 0) {
 3700     //        goto DONE;
 3701     //    }
 3702     //    int result = len;
 3703     //    do {
 3704     //        if (used >= blockSize) {
 3705     //            if (len >= bulk_width * blockSize) {
 3706     //                CTR_large_block();
 3707     //                if (len == 0)
 3708     //                    goto DONE;
 3709     //            }
 3710     //            for (;;) {
 3711     //                16ByteVector v0 = counter;
 3712     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3713     //                used = 0;
 3714     //                if (len < blockSize)
 3715     //                    break;    /* goto NEXT */
 3716     //                16ByteVector v1 = load16Bytes(in, offset);
 3717     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
 3719     //                used = blockSize;
 3720     //                offset += blockSize;
 3721     //                len -= blockSize;
 3722     //                if (len == 0)
 3723     //                    goto DONE;
 3724     //            }
 3725     //        }
 3726     //      NEXT:
 3727     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3728     //        len--;
 3729     //    } while (len != 0);
 3730     //  DONE:
 3731     //    return result;
 3732     //
 3733     // CTR_large_block()
 3734     //    Wide bulk encryption of whole blocks.
 3735 
 3736     __ align(CodeEntryAlignment);
 3737     StubCodeMark mark(this, stub_id);
 3738     start = __ pc();
 3739     __ enter();
 3740 
 3741     Label DONE, CTR_large_block, large_block_return;
 3742     __ ldrw(used, Address(used_ptr));
 3743     __ cbzw(saved_len, DONE);
 3744 
 3745     __ mov(len, saved_len);
 3746     __ mov(offset, 0);
 3747 
 3748     // Compute #rounds for AES based on the length of the key array
 3749     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3750 
 3751     __ aesenc_loadkeys(key, keylen);
 3752 
 3753     {
 3754       Label L_CTR_loop, NEXT;
 3755 
 3756       __ bind(L_CTR_loop);
 3757 
 3758       __ cmp(used, block_size);
 3759       __ br(__ LO, NEXT);
 3760 
 3761       // Maybe we have a lot of data
 3762       __ subsw(rscratch1, len, bulk_width * block_size);
 3763       __ br(__ HS, CTR_large_block);
 3764       __ BIND(large_block_return);
 3765       __ cbzw(len, DONE);
 3766 
 3767       // Setup the counter
 3768       __ movi(v4, __ T4S, 0);
 3769       __ movi(v5, __ T4S, 1);
 3770       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3771 
 3772       // 128-bit big-endian increment
 3773       __ ld1(v0, __ T16B, counter);
 3774       __ rev64(v16, __ T16B, v0);
 3775       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3776       __ rev64(v16, __ T16B, v16);
 3777       __ st1(v16, __ T16B, counter);
 3778       // Previous counter value is in v0
 3779       // v4 contains { 0, 1 }
 3780 
 3781       {
 3782         // We have fewer than bulk_width blocks of data left. Encrypt
 3783         // them one by one until there is less than a full block
 3784         // remaining, being careful to save both the encrypted counter
 3785         // and the counter.
 3786 
 3787         Label inner_loop;
 3788         __ bind(inner_loop);
 3789         // Counter to encrypt is in v0
 3790         __ aesecb_encrypt(noreg, noreg, keylen);
 3791         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3792 
 3793         // Do we have a remaining full block?
 3794 
 3795         __ mov(used, 0);
 3796         __ cmp(len, block_size);
 3797         __ br(__ LO, NEXT);
 3798 
 3799         // Yes, we have a full block
 3800         __ ldrq(v1, Address(in, offset));
 3801         __ eor(v1, __ T16B, v1, v0);
 3802         __ strq(v1, Address(out, offset));
 3803         __ mov(used, block_size);
 3804         __ add(offset, offset, block_size);
 3805 
 3806         __ subw(len, len, block_size);
 3807         __ cbzw(len, DONE);
 3808 
 3809         // Increment the counter, store it back
 3810         __ orr(v0, __ T16B, v16, v16);
 3811         __ rev64(v16, __ T16B, v16);
 3812         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3813         __ rev64(v16, __ T16B, v16);
 3814         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3815 
 3816         __ b(inner_loop);
 3817       }
 3818 
 3819       __ BIND(NEXT);
 3820 
 3821       // Encrypt a single byte, and loop.
 3822       // We expect this to be a rare event.
 3823       __ ldrb(rscratch1, Address(in, offset));
 3824       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3825       __ eor(rscratch1, rscratch1, rscratch2);
 3826       __ strb(rscratch1, Address(out, offset));
 3827       __ add(offset, offset, 1);
 3828       __ add(used, used, 1);
      __ subw(len, len, 1);
 3830       __ cbnzw(len, L_CTR_loop);
 3831     }
 3832 
 3833     __ bind(DONE);
 3834     __ strw(used, Address(used_ptr));
 3835     __ mov(r0, saved_len);
 3836 
 3837     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3838     __ ret(lr);
 3839 
 3840     // Bulk encryption
 3841 
    __ BIND(CTR_large_block);
 3843     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3844 
 3845     if (bulk_width == 8) {
 3846       __ sub(sp, sp, 4 * 16);
 3847       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3848     }
 3849     __ sub(sp, sp, 4 * 16);
 3850     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3851     RegSet saved_regs = (RegSet::of(in, out, offset)
 3852                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3853     __ push(saved_regs, sp);
 3854     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3855     __ add(in, in, offset);
 3856     __ add(out, out, offset);
 3857 
 3858     // Keys should already be loaded into the correct registers
 3859 
 3860     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3861     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3862 
 3863     // AES/CTR loop
 3864     {
 3865       Label L_CTR_loop;
 3866       __ BIND(L_CTR_loop);
 3867 
 3868       // Setup the counters
 3869       __ movi(v8, __ T4S, 0);
 3870       __ movi(v9, __ T4S, 1);
 3871       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3872 
 3873       for (int i = 0; i < bulk_width; i++) {
 3874         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3875         __ rev64(v0_ofs, __ T16B, v16);
 3876         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3877       }
 3878 
 3879       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3880 
 3881       // Encrypt the counters
 3882       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3883 
 3884       if (bulk_width == 8) {
 3885         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3886       }
 3887 
 3888       // XOR the encrypted counters with the inputs
 3889       for (int i = 0; i < bulk_width; i++) {
 3890         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3891         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3892         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3893       }
 3894 
 3895       // Write the encrypted data
 3896       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3897       if (bulk_width == 8) {
 3898         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3899       }
 3900 
 3901       __ subw(len, len, 16 * bulk_width);
 3902       __ cbnzw(len, L_CTR_loop);
 3903     }
 3904 
 3905     // Save the counter back where it goes
 3906     __ rev64(v16, __ T16B, v16);
 3907     __ st1(v16, __ T16B, counter);
 3908 
 3909     __ pop(saved_regs, sp);
 3910 
 3911     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3912     if (bulk_width == 8) {
 3913       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3914     }
 3915 
 3916     __ andr(rscratch1, len, -16 * bulk_width);
 3917     __ sub(len, len, rscratch1);
 3918     __ add(offset, offset, rscratch1);
 3919     __ mov(used, 16);
 3920     __ strw(used, Address(used_ptr));
 3921     __ b(large_block_return);
 3922 
 3923     // record the stub entry and end
 3924     store_archive_data(stub_id, start, __ pc());
 3925 
 3926     return start;
 3927   }
 3928 
 3929   // Vector AES Galois Counter Mode implementation. Parameters:
 3930   //
 3931   // in = c_rarg0
 3932   // len = c_rarg1
 3933   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3934   // out = c_rarg3
 3935   // key = c_rarg4
 3936   // state = c_rarg5 - GHASH.state
 3937   // subkeyHtbl = c_rarg6 - powers of H
 3938   // counter = c_rarg7 - 16 bytes of CTR
 3939   // return - number of processed bytes
 3940   address generate_galoisCounterMode_AESCrypt() {
 3941     Label ghash_polynomial; // local data generated after code
 3942     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3943     int entry_count = StubInfo::entry_count(stub_id);
 3944     assert(entry_count == 1, "sanity check");
 3945     address start = load_archive_data(stub_id);
 3946     if (start != nullptr) {
 3947       return start;
 3948     }
 3949     __ align(CodeEntryAlignment);
 3950     StubCodeMark mark(this, stub_id);
 3951     start = __ pc();
 3952     __ enter();
 3953 
 3954     const Register in = c_rarg0;
 3955     const Register len = c_rarg1;
 3956     const Register ct = c_rarg2;
    const Register out = c_rarg3;
 3959 
 3960     const Register key = c_rarg4;
 3961     const Register state = c_rarg5;
 3962 
 3963     const Register subkeyHtbl = c_rarg6;
 3964 
    const Register counter = c_rarg7; // 16 bytes of CTR, updated with the
                                      // incremented counter at the end
 3966 
 3967     const Register keylen = r10;
 3968     // Save state before entering routine
 3969     __ sub(sp, sp, 4 * 16);
 3970     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3971     __ sub(sp, sp, 4 * 16);
 3972     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3973 
 3975     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3976     __ str(len, __ pre(sp, -2 * wordSize));
 3977 
 3978     Label DONE;
 3979     __ cbz(len, DONE);
 3980 
 3981     // Compute #rounds for AES based on the length of the key array
 3982     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3983 
 3984     __ aesenc_loadkeys(key, keylen);
 3985     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3986     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3987 
 3988     // AES/CTR loop
 3989     {
 3990       Label L_CTR_loop;
 3991       __ BIND(L_CTR_loop);
 3992 
 3993       // Setup the counters
 3994       __ movi(v8, __ T4S, 0);
 3995       __ movi(v9, __ T4S, 1);
 3996       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3997 
      assert(v0->encoding() < v8->encoding(), "AES counter blocks v0..v7 must precede v8");
 3999       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 4000         FloatRegister f = as_FloatRegister(i);
 4001         __ rev32(f, __ T16B, v16);
 4002         __ addv(v16, __ T4S, v16, v8);
 4003       }
 4004 
 4005       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 4006 
 4007       // Encrypt the counters
 4008       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 4009 
 4010       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 4011 
 4012       // XOR the encrypted counters with the inputs
 4013       for (int i = 0; i < 8; i++) {
 4014         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 4015         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 4016         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 4017       }
 4018       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 4019       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 4020 
 4021       __ subw(len, len, 16 * 8);
 4022       __ cbnzw(len, L_CTR_loop);
 4023     }
 4024 
 4025     __ rev32(v16, __ T16B, v16);
 4026     __ st1(v16, __ T16B, counter);
 4027 
 4028     __ ldr(len, Address(sp));
 4029     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 4030 
 4031     // GHASH/CTR loop
 4032     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 4033                                 len, /*unrolls*/4);
 4034 
 4035 #ifdef ASSERT
 4036     { Label L;
 4037       __ cmp(len, (unsigned char)0);
 4038       __ br(Assembler::EQ, L);
 4039       __ stop("stubGenerator: abort");
 4040       __ bind(L);
 4041   }
 4042 #endif
 4043 
    __ bind(DONE);
 4045     // Return the number of bytes processed
 4046     __ ldr(r0, __ post(sp, 2 * wordSize));
 4047 
 4048     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 4049     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 4050 
 4051     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4052     __ ret(lr);
 4053 
 4054     // bind label and generate polynomial data
 4055     __ align(wordSize * 2);
 4056     __ bind(ghash_polynomial);
 4057     __ emit_int64(0x87);  // The low-order bits of the field
 4058                           // polynomial (i.e. p = z^7+z^2+z+1)
 4059                           // repeated in the low and high parts of a
 4060                           // 128-bit vector
 4061     __ emit_int64(0x87);
 4062 
 4063     // record the stub entry and end
 4064     store_archive_data(stub_id, start, __ pc());
 4065 
 4066     return start;
 4067   }
 4068 
 4069   class Cached64Bytes {
 4070   private:
 4071     MacroAssembler *_masm;
 4072     Register _regs[8];
 4073 
 4074   public:
 4075     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "need 8 registers to cache 16 4-byte words, got %u", rs.size());
 4077       auto it = rs.begin();
 4078       for (auto &r: _regs) {
 4079         r = *it;
 4080         ++it;
 4081       }
 4082     }
 4083 
 4084     void gen_loads(Register base) {
 4085       for (int i = 0; i < 8; i += 2) {
 4086         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 4087       }
 4088     }
 4089 
 4090     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
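    // For example, extract_u32(dest, 5) emits "ubfx dest, _regs[2], #32, #32",
    // selecting the upper word of the third cached 64-bit register.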
 4091     void extract_u32(Register dest, int i) {
 4092       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 4093     }
 4094   };
 4095 
 4096   // Utility routines for md5.
 4097   // Clobbers r10 and r11.
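  // Each helper emits one step of the form
  //   r1 = r2 + rol(r1 + f(r2, r3, r4) + x[k] + t, s)
  // where f is the corresponding round function from RFC 1321:
  //   FF: F(b,c,d) = (b & c) | (~b & d), computed here as ((c ^ d) & b) ^ d
  //   GG: G(b,c,d) = (b & d) | (c & ~d); the two terms are bitwise disjoint,
  //       so summing them is equivalent to OR-ing them
  //   HH: H(b,c,d) = b ^ c ^ d
  //   II: I(b,c,d) = c ^ (b | ~d)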
 4098   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4099               int k, int s, int t) {
 4100     Register rscratch3 = r10;
 4101     Register rscratch4 = r11;
 4102 
 4103     __ eorw(rscratch3, r3, r4);
 4104     __ movw(rscratch2, t);
 4105     __ andw(rscratch3, rscratch3, r2);
 4106     __ addw(rscratch4, r1, rscratch2);
 4107     reg_cache.extract_u32(rscratch1, k);
 4108     __ eorw(rscratch3, rscratch3, r4);
 4109     __ addw(rscratch4, rscratch4, rscratch1);
 4110     __ addw(rscratch3, rscratch3, rscratch4);
 4111     __ rorw(rscratch2, rscratch3, 32 - s);
 4112     __ addw(r1, rscratch2, r2);
 4113   }
 4114 
 4115   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4116               int k, int s, int t) {
 4117     Register rscratch3 = r10;
 4118     Register rscratch4 = r11;
 4119 
 4120     reg_cache.extract_u32(rscratch1, k);
 4121     __ movw(rscratch2, t);
 4122     __ addw(rscratch4, r1, rscratch2);
 4123     __ addw(rscratch4, rscratch4, rscratch1);
 4124     __ bicw(rscratch2, r3, r4);
 4125     __ andw(rscratch3, r2, r4);
 4126     __ addw(rscratch2, rscratch2, rscratch4);
 4127     __ addw(rscratch2, rscratch2, rscratch3);
 4128     __ rorw(rscratch2, rscratch2, 32 - s);
 4129     __ addw(r1, rscratch2, r2);
 4130   }
 4131 
 4132   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4133               int k, int s, int t) {
 4134     Register rscratch3 = r10;
 4135     Register rscratch4 = r11;
 4136 
 4137     __ eorw(rscratch3, r3, r4);
 4138     __ movw(rscratch2, t);
 4139     __ addw(rscratch4, r1, rscratch2);
 4140     reg_cache.extract_u32(rscratch1, k);
 4141     __ eorw(rscratch3, rscratch3, r2);
 4142     __ addw(rscratch4, rscratch4, rscratch1);
 4143     __ addw(rscratch3, rscratch3, rscratch4);
 4144     __ rorw(rscratch2, rscratch3, 32 - s);
 4145     __ addw(r1, rscratch2, r2);
 4146   }
 4147 
 4148   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4149               int k, int s, int t) {
 4150     Register rscratch3 = r10;
 4151     Register rscratch4 = r11;
 4152 
 4153     __ movw(rscratch3, t);
 4154     __ ornw(rscratch2, r2, r4);
 4155     __ addw(rscratch4, r1, rscratch3);
 4156     reg_cache.extract_u32(rscratch1, k);
 4157     __ eorw(rscratch3, rscratch2, r3);
 4158     __ addw(rscratch4, rscratch4, rscratch1);
 4159     __ addw(rscratch3, rscratch3, rscratch4);
 4160     __ rorw(rscratch2, rscratch3, 32 - s);
 4161     __ addw(r1, rscratch2, r2);
 4162   }
 4163 
 4164   // Arguments:
 4165   //
 4166   // Inputs:
 4167   //   c_rarg0   - byte[]  source+offset
 4168   //   c_rarg1   - int[]   SHA.state
 4169   //   c_rarg2   - int     offset
 4170   //   c_rarg3   - int     limit
 4171   //
 4172   address generate_md5_implCompress(StubId stub_id) {
 4173     bool multi_block;
 4174     switch (stub_id) {
 4175     case StubId::stubgen_md5_implCompress_id:
 4176       multi_block = false;
 4177       break;
 4178     case StubId::stubgen_md5_implCompressMB_id:
 4179       multi_block = true;
 4180       break;
 4181     default:
 4182       ShouldNotReachHere();
 4183     }
 4184     int entry_count = StubInfo::entry_count(stub_id);
 4185     assert(entry_count == 1, "sanity check");
 4186     address start = load_archive_data(stub_id);
 4187     if (start != nullptr) {
 4188       return start;
 4189     }
 4190     __ align(CodeEntryAlignment);
 4191 
 4192     StubCodeMark mark(this, stub_id);
 4193     start = __ pc();
 4194 
 4195     Register buf       = c_rarg0;
 4196     Register state     = c_rarg1;
 4197     Register ofs       = c_rarg2;
 4198     Register limit     = c_rarg3;
 4199     Register a         = r4;
 4200     Register b         = r5;
 4201     Register c         = r6;
 4202     Register d         = r7;
 4203     Register rscratch3 = r10;
 4204     Register rscratch4 = r11;
 4205 
 4206     Register state_regs[2] = { r12, r13 };
 4207     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 4208     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 4209 
 4210     __ push(saved_regs, sp);
 4211 
 4212     __ ldp(state_regs[0], state_regs[1], Address(state));
 4213     __ ubfx(a, state_regs[0],  0, 32);
 4214     __ ubfx(b, state_regs[0], 32, 32);
 4215     __ ubfx(c, state_regs[1],  0, 32);
 4216     __ ubfx(d, state_regs[1], 32, 32);
 4217 
 4218     Label md5_loop;
 4219     __ BIND(md5_loop);
 4220 
 4221     reg_cache.gen_loads(buf);
 4222 
 4223     // Round 1
 4224     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 4225     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 4226     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 4227     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 4228     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 4229     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 4230     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 4231     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 4232     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 4233     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 4234     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 4235     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 4236     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 4237     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 4238     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 4239     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 4240 
 4241     // Round 2
 4242     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 4243     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 4244     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 4245     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 4246     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 4247     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 4248     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 4249     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 4250     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 4251     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 4252     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 4253     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 4254     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 4255     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 4256     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 4257     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 4258 
 4259     // Round 3
 4260     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 4261     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 4262     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 4263     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 4264     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 4265     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 4266     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 4267     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 4268     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 4269     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 4270     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 4271     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 4272     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 4273     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 4274     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 4275     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 4276 
 4277     // Round 4
 4278     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 4279     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 4280     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 4281     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 4282     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 4283     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 4284     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 4285     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 4286     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 4287     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 4288     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 4289     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 4290     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 4291     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 4292     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 4293     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 4294 
 4295     __ addw(a, state_regs[0], a);
 4296     __ ubfx(rscratch2, state_regs[0], 32, 32);
 4297     __ addw(b, rscratch2, b);
 4298     __ addw(c, state_regs[1], c);
 4299     __ ubfx(rscratch4, state_regs[1], 32, 32);
 4300     __ addw(d, rscratch4, d);
 4301 
 4302     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 4303     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 4304 
 4305     if (multi_block) {
 4306       __ add(buf, buf, 64);
 4307       __ add(ofs, ofs, 64);
 4308       __ cmp(ofs, limit);
 4309       __ br(Assembler::LE, md5_loop);
 4310       __ mov(c_rarg0, ofs); // return ofs
 4311     }
 4312 
 4313     // write hash values back in the correct order
 4314     __ stp(state_regs[0], state_regs[1], Address(state));
 4315 
 4316     __ pop(saved_regs, sp);
 4317 
 4318     __ ret(lr);
 4319 
 4320     // record the stub entry and end
 4321     store_archive_data(stub_id, start, __ pc());
 4322 
 4323     return start;
 4324   }
 4325 
 4326   // Arguments:
 4327   //
 4328   // Inputs:
 4329   //   c_rarg0   - byte[]  source+offset
 4330   //   c_rarg1   - int[]   SHA.state
 4331   //   c_rarg2   - int     offset
 4332   //   c_rarg3   - int     limit
 4333   //
 4334   address generate_sha1_implCompress(StubId stub_id) {
 4335     bool multi_block;
 4336     switch (stub_id) {
 4337     case StubId::stubgen_sha1_implCompress_id:
 4338       multi_block = false;
 4339       break;
 4340     case StubId::stubgen_sha1_implCompressMB_id:
 4341       multi_block = true;
 4342       break;
 4343     default:
 4344       ShouldNotReachHere();
 4345     }
 4346     int entry_count = StubInfo::entry_count(stub_id);
 4347     assert(entry_count == 1, "sanity check");
 4348     address start = load_archive_data(stub_id);
 4349     if (start != nullptr) {
 4350       return start;
 4351     }
 4352     __ align(CodeEntryAlignment);
 4353 
 4354     StubCodeMark mark(this, stub_id);
 4355     start = __ pc();
 4356 
 4357     Register buf   = c_rarg0;
 4358     Register state = c_rarg1;
 4359     Register ofs   = c_rarg2;
 4360     Register limit = c_rarg3;
 4361 
 4362     Label keys;
 4363     Label sha1_loop;
 4364 
 4365     // load the keys into v0..v3
 4366     __ adr(rscratch1, keys);
 4367     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 4368     // load 5 words state into v6, v7
 4369     __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));

 4373     __ BIND(sha1_loop);
 4374     // load 64 bytes of data into v16..v19
 4375     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4376     __ rev32(v16, __ T16B, v16);
 4377     __ rev32(v17, __ T16B, v17);
 4378     __ rev32(v18, __ T16B, v18);
 4379     __ rev32(v19, __ T16B, v19);
 4380 
 4381     // do the sha1
 4382     __ addv(v4, __ T4S, v16, v0);
 4383     __ orr(v20, __ T16B, v6, v6);
 4384 
 4385     FloatRegister d0 = v16;
 4386     FloatRegister d1 = v17;
 4387     FloatRegister d2 = v18;
 4388     FloatRegister d3 = v19;
 4389 
 4390     for (int round = 0; round < 20; round++) {
 4391       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 4392       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 4393       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 4394       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 4395       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 4396 
 4397       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 4398       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 4399       __ sha1h(tmp2, __ T4S, v20);
 4400       if (round < 5)
 4401         __ sha1c(v20, __ T4S, tmp3, tmp4);
 4402       else if (round < 10 || round >= 15)
 4403         __ sha1p(v20, __ T4S, tmp3, tmp4);
 4404       else
 4405         __ sha1m(v20, __ T4S, tmp3, tmp4);
 4406       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 4407 
 4408       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4409     }
 4410 
 4411     __ addv(v7, __ T2S, v7, v21);
 4412     __ addv(v6, __ T4S, v6, v20);
 4413 
 4414     if (multi_block) {
 4415       __ add(ofs, ofs, 64);
 4416       __ cmp(ofs, limit);
 4417       __ br(Assembler::LE, sha1_loop);
 4418       __ mov(c_rarg0, ofs); // return ofs
 4419     }
 4420 
 4421     __ strq(v6, Address(state, 0));
 4422     __ strs(v7, Address(state, 16));
 4423 
 4424     __ ret(lr);
 4425 
 4426     __ bind(keys);
 4427     __ emit_int32(0x5a827999);
 4428     __ emit_int32(0x6ed9eba1);
 4429     __ emit_int32(0x8f1bbcdc);
 4430     __ emit_int32(0xca62c1d6);
 4431 
 4432     // record the stub entry and end
 4433     store_archive_data(stub_id, start, __ pc());
 4434 
 4435     return start;
 4436   }
 4437 
 4438 
 4439   // Arguments:
 4440   //
 4441   // Inputs:
 4442   //   c_rarg0   - byte[]  source+offset
 4443   //   c_rarg1   - int[]   SHA.state
 4444   //   c_rarg2   - int     offset
 4445   //   c_rarg3   - int     limit
 4446   //
 4447   address generate_sha256_implCompress(StubId stub_id) {
 4448     bool multi_block;
 4449     switch (stub_id) {
 4450     case StubId::stubgen_sha256_implCompress_id:
 4451       multi_block = false;
 4452       break;
 4453     case StubId::stubgen_sha256_implCompressMB_id:
 4454       multi_block = true;
 4455       break;
 4456     default:
 4457       ShouldNotReachHere();
 4458     }
 4459     int entry_count = StubInfo::entry_count(stub_id);
 4460     assert(entry_count == 1, "sanity check");
 4461     address start = load_archive_data(stub_id);
 4462     if (start != nullptr) {
 4463       return start;
 4464     }
 4465     __ align(CodeEntryAlignment);
 4466     StubCodeMark mark(this, stub_id);
 4467     start = __ pc();
 4468 
 4469     Register buf   = c_rarg0;
 4470     Register state = c_rarg1;
 4471     Register ofs   = c_rarg2;
 4472     Register limit = c_rarg3;
 4473 
    Label sha256_loop;
 4475 
 4476     __ stpd(v8, v9, __ pre(sp, -32));
 4477     __ stpd(v10, v11, Address(sp, 16));
 4478 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7

 4487     // load 16 keys to v16..v31
 4488     __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
 4489     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 4490     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 4491     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 4492     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 4493 
 4494     // load 8 words (256 bits) state
 4495     __ ldpq(v0, v1, state);
 4496 
    __ BIND(sha256_loop);
 4498     // load 64 bytes of data into v8..v11
 4499     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4500     __ rev32(v8, __ T16B, v8);
 4501     __ rev32(v9, __ T16B, v9);
 4502     __ rev32(v10, __ T16B, v10);
 4503     __ rev32(v11, __ T16B, v11);
 4504 
 4505     __ addv(v6, __ T4S, v8, v16);
 4506     __ orr(v2, __ T16B, v0, v0);
 4507     __ orr(v3, __ T16B, v1, v1);
 4508 
 4509     FloatRegister d0 = v8;
 4510     FloatRegister d1 = v9;
 4511     FloatRegister d2 = v10;
    FloatRegister d3 = v11;

 4515     for (int round = 0; round < 16; round++) {
 4516       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4517       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4518       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4519       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4520 
 4521       if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
 4523       if (round < 15)
 4524         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4525       __ sha256h(v2, __ T4S, v3, tmp2);
 4526       __ sha256h2(v3, __ T4S, v4, tmp2);
 4527       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4528 
 4529       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4530     }
 4531 
 4532     __ addv(v0, __ T4S, v0, v2);
 4533     __ addv(v1, __ T4S, v1, v3);
 4534 
 4535     if (multi_block) {
 4536       __ add(ofs, ofs, 64);
 4537       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
 4539       __ mov(c_rarg0, ofs); // return ofs
 4540     }
 4541 
 4542     __ ldpd(v10, v11, Address(sp, 16));
 4543     __ ldpd(v8, v9, __ post(sp, 32));
 4544 
 4545     __ stpq(v0, v1, state);
 4546 
 4547     __ ret(lr);
 4548 
 4549     // record the stub entry and end
 4550     store_archive_data(stub_id, start, __ pc());
 4551 
 4552     return start;
 4553   }
 4554 
 4555   // Double rounds for sha512.
 4556   void sha512_dround(int dr,
 4557                      FloatRegister vi0, FloatRegister vi1,
 4558                      FloatRegister vi2, FloatRegister vi3,
 4559                      FloatRegister vi4, FloatRegister vrc0,
 4560                      FloatRegister vrc1, FloatRegister vin0,
 4561                      FloatRegister vin1, FloatRegister vin2,
 4562                      FloatRegister vin3, FloatRegister vin4) {
 4563       if (dr < 36) {
 4564         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4565       }
 4566       __ addv(v5, __ T2D, vrc0, vin0);
 4567       __ ext(v6, __ T16B, vi2, vi3, 8);
 4568       __ ext(v5, __ T16B, v5, v5, 8);
 4569       __ ext(v7, __ T16B, vi1, vi2, 8);
 4570       __ addv(vi3, __ T2D, vi3, v5);
 4571       if (dr < 32) {
 4572         __ ext(v5, __ T16B, vin3, vin4, 8);
 4573         __ sha512su0(vin0, __ T2D, vin1);
 4574       }
 4575       __ sha512h(vi3, __ T2D, v6, v7);
 4576       if (dr < 32) {
 4577         __ sha512su1(vin0, __ T2D, vin2, v5);
 4578       }
 4579       __ addv(vi4, __ T2D, vi1, vi3);
 4580       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4581   }
 4582 
 4583   // Arguments:
 4584   //
 4585   // Inputs:
 4586   //   c_rarg0   - byte[]  source+offset
 4587   //   c_rarg1   - int[]   SHA.state
 4588   //   c_rarg2   - int     offset
 4589   //   c_rarg3   - int     limit
 4590   //
 4591   address generate_sha512_implCompress(StubId stub_id) {
 4592     bool multi_block;
 4593     switch (stub_id) {
 4594     case StubId::stubgen_sha512_implCompress_id:
 4595       multi_block = false;
 4596       break;
 4597     case StubId::stubgen_sha512_implCompressMB_id:
 4598       multi_block = true;
 4599       break;
 4600     default:
 4601       ShouldNotReachHere();
 4602     }
 4603     int entry_count = StubInfo::entry_count(stub_id);
 4604     assert(entry_count == 1, "sanity check");
 4605     address start = load_archive_data(stub_id);
 4606     if (start != nullptr) {
 4607       return start;
 4608     }
 4609     __ align(CodeEntryAlignment);
 4610     StubCodeMark mark(this, stub_id);
 4611     start = __ pc();
 4612 
 4613     Register buf   = c_rarg0;
 4614     Register state = c_rarg1;
 4615     Register ofs   = c_rarg2;
 4616     Register limit = c_rarg3;
 4617 
 4618     __ stpd(v8, v9, __ pre(sp, -64));
 4619     __ stpd(v10, v11, Address(sp, 16));
 4620     __ stpd(v12, v13, Address(sp, 32));
 4621     __ stpd(v14, v15, Address(sp, 48));
 4622 
 4623     Label sha512_loop;
 4624 
 4625     // load state
 4626     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4627 
 4628     // load first 4 round constants
 4629     __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
 4630     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4631 
 4632     __ BIND(sha512_loop);
 4633     // load 128B of data into v12..v19
 4634     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4635     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4636     __ rev64(v12, __ T16B, v12);
 4637     __ rev64(v13, __ T16B, v13);
 4638     __ rev64(v14, __ T16B, v14);
 4639     __ rev64(v15, __ T16B, v15);
 4640     __ rev64(v16, __ T16B, v16);
 4641     __ rev64(v17, __ T16B, v17);
 4642     __ rev64(v18, __ T16B, v18);
 4643     __ rev64(v19, __ T16B, v19);
 4644 
 4645     __ mov(rscratch2, rscratch1);
 4646 
 4647     __ mov(v0, __ T16B, v8);
 4648     __ mov(v1, __ T16B, v9);
 4649     __ mov(v2, __ T16B, v10);
 4650     __ mov(v3, __ T16B, v11);
 4651 
 4652     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4653     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4654     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4655     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4656     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4657     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4658     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4659     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4660     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4661     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4662     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4663     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4664     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4665     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4666     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4667     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4668     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4669     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4670     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4671     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4672     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4673     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4674     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4675     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4676     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4677     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4678     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4679     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4680     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4681     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4682     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4683     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4684     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4685     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4686     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4687     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4688     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4689     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4690     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4691     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4692 
 4693     __ addv(v8, __ T2D, v8, v0);
 4694     __ addv(v9, __ T2D, v9, v1);
 4695     __ addv(v10, __ T2D, v10, v2);
 4696     __ addv(v11, __ T2D, v11, v3);
 4697 
 4698     if (multi_block) {
 4699       __ add(ofs, ofs, 128);
 4700       __ cmp(ofs, limit);
 4701       __ br(Assembler::LE, sha512_loop);
 4702       __ mov(c_rarg0, ofs); // return ofs
 4703     }
 4704 
 4705     __ st1(v8, v9, v10, v11, __ T2D, state);
 4706 
 4707     __ ldpd(v14, v15, Address(sp, 48));
 4708     __ ldpd(v12, v13, Address(sp, 32));
 4709     __ ldpd(v10, v11, Address(sp, 16));
 4710     __ ldpd(v8, v9, __ post(sp, 64));
 4711 
 4712     __ ret(lr);
 4713 
 4714     // record the stub entry and end
 4715     store_archive_data(stub_id, start, __ pc());
 4716 
 4717     return start;
 4718   }
 4719 
 4720   // Execute one round of keccak of two computations in parallel.
 4721   // One of the states should be loaded into the lower halves of
 4722   // the vector registers v0-v24, the other should be loaded into
 4723   // the upper halves of those registers. The ld1r instruction loads
 4724   // the round constant into both halves of register v31.
 4725   // Intermediate results c0...c5 and d0...d5 are computed
 4726   // in registers v25...v30.
 4727   // All vector instructions that are used operate on both register
 4728   // halves in parallel.
  // If only a single computation is needed, one can load just the lower halves.
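  //
  // Schematically, each invocation performs (indices mod 5, FIPS 202
  // notation):
  //   theta:  c[x] = a[x,0] ^ ... ^ a[x,4]; d[x] = c[x-1] ^ rol(c[x+1], 1);
  //           every lane a[x,y] ^= d[x]   (the eor3/rax1/xar instructions)
  //   rho/pi: the per-lane rotations are folded into the same xar instructions
  //   chi:    a[x,y] ^= ~a[x+1,y] & a[x+2,y]   (the bcax instructions)
  //   iota:   a[0,0] ^= round constant   (the final eor with v31)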
 4730   void keccak_round(Register rscratch1) {
 4731   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4734   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4735   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4736   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4737   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4738   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4739   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4740   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4741 
 4742   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4743   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4744   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4745   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4746   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4747 
 4748   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4749   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4751   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4752   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4753   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4754   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4755   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4756   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4757   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4758   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4759   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4760   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4761   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4762   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4763   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4764   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4765   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4766   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4767   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4768   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4769   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4770   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4771   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4772   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4773 
  __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
 4775   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4776   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4777   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4778   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4779 
 4780   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4781 
 4782   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4783   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4784   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4785   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4786   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4787 
 4788   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4789   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4790   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4791   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4792   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4793 
 4794   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4795   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4796   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4797   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4798   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4799 
 4800   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4801   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4802   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4803   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4804   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4805 
 4806   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4807   }
 4808 
 4809   // Arguments:
 4810   //
 4811   // Inputs:
 4812   //   c_rarg0   - byte[]  source+offset
 4813   //   c_rarg1   - byte[]  SHA.state
 4814   //   c_rarg2   - int     block_size
 4815   //   c_rarg3   - int     offset
 4816   //   c_rarg4   - int     limit
 4817   //
 4818   address generate_sha3_implCompress(StubId stub_id) {
 4819     bool multi_block;
 4820     switch (stub_id) {
 4821     case StubId::stubgen_sha3_implCompress_id:
 4822       multi_block = false;
 4823       break;
 4824     case StubId::stubgen_sha3_implCompressMB_id:
 4825       multi_block = true;
 4826       break;
 4827     default:
 4828       ShouldNotReachHere();
 4829     }
 4830     int entry_count = StubInfo::entry_count(stub_id);
 4831     assert(entry_count == 1, "sanity check");
 4832     address start = load_archive_data(stub_id);
 4833     if (start != nullptr) {
 4834       return start;
 4835     }
 4836     __ align(CodeEntryAlignment);
 4837     StubCodeMark mark(this, stub_id);
 4838     start = __ pc();
 4839 
 4840     Register buf           = c_rarg0;
 4841     Register state         = c_rarg1;
 4842     Register block_size    = c_rarg2;
 4843     Register ofs           = c_rarg3;
 4844     Register limit         = c_rarg4;
 4845 
 4846     Label sha3_loop, rounds24_loop;
 4847     Label sha3_512_or_sha3_384, shake128;
 4848 
 4849     __ stpd(v8, v9, __ pre(sp, -64));
 4850     __ stpd(v10, v11, Address(sp, 16));
 4851     __ stpd(v12, v13, Address(sp, 32));
 4852     __ stpd(v14, v15, Address(sp, 48));
 4853 
 4854     // load state
 4855     __ add(rscratch1, state, 32);
 4856     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4857     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4858     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4859     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4860     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4861     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4862     __ ld1(v24, __ T1D, rscratch1);
 4863 
 4864     __ BIND(sha3_loop);
 4865 
 4866     // 24 keccak rounds
 4867     __ movw(rscratch2, 24);
 4868 
 4869     // load round_constants base
 4870     __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
 4871 
 4872     // load input
 4873     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4874     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4875     __ eor(v0, __ T8B, v0, v25);
 4876     __ eor(v1, __ T8B, v1, v26);
 4877     __ eor(v2, __ T8B, v2, v27);
 4878     __ eor(v3, __ T8B, v3, v28);
 4879     __ eor(v4, __ T8B, v4, v29);
 4880     __ eor(v5, __ T8B, v5, v30);
 4881     __ eor(v6, __ T8B, v6, v31);
 4882 
 4883     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4884     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4885 
 4886     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4887     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4888     __ eor(v7, __ T8B, v7, v25);
 4889     __ eor(v8, __ T8B, v8, v26);
 4890     __ eor(v9, __ T8B, v9, v27);
 4891     __ eor(v10, __ T8B, v10, v28);
 4892     __ eor(v11, __ T8B, v11, v29);
 4893     __ eor(v12, __ T8B, v12, v30);
 4894     __ eor(v13, __ T8B, v13, v31);
 4895 
 4896     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4897     __ eor(v14, __ T8B, v14, v25);
 4898     __ eor(v15, __ T8B, v15, v26);
 4899     __ eor(v16, __ T8B, v16, v27);
 4900 
 4901     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4902     __ andw(c_rarg5, block_size, 48);
 4903     __ cbzw(c_rarg5, rounds24_loop);
 4904 
 4905     __ tbnz(block_size, 5, shake128);
 4906     // block_size == 144, bit5 == 0, SHA3-224
 4907     __ ldrd(v28, __ post(buf, 8));
 4908     __ eor(v17, __ T8B, v17, v28);
 4909     __ b(rounds24_loop);
 4910 
 4911     __ BIND(shake128);
 4912     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4913     __ eor(v17, __ T8B, v17, v28);
 4914     __ eor(v18, __ T8B, v18, v29);
 4915     __ eor(v19, __ T8B, v19, v30);
 4916     __ eor(v20, __ T8B, v20, v31);
 4917     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4918 
 4919     __ BIND(sha3_512_or_sha3_384);
 4920     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4921     __ eor(v7, __ T8B, v7, v25);
 4922     __ eor(v8, __ T8B, v8, v26);
 4923     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4924 
 4925     // SHA3-384
 4926     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4927     __ eor(v9,  __ T8B, v9,  v27);
 4928     __ eor(v10, __ T8B, v10, v28);
 4929     __ eor(v11, __ T8B, v11, v29);
 4930     __ eor(v12, __ T8B, v12, v30);
 4931 
 4932     __ BIND(rounds24_loop);
 4933     __ subw(rscratch2, rscratch2, 1);
 4934 
 4935     keccak_round(rscratch1);
 4936 
 4937     __ cbnzw(rscratch2, rounds24_loop);
 4938 
 4939     if (multi_block) {
 4940       __ add(ofs, ofs, block_size);
 4941       __ cmp(ofs, limit);
 4942       __ br(Assembler::LE, sha3_loop);
 4943       __ mov(c_rarg0, ofs); // return ofs
 4944     }
 4945 
 4946     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4947     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4948     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4949     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4950     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4951     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4952     __ st1(v24, __ T1D, state);
 4953 
 4954     // restore callee-saved registers
 4955     __ ldpd(v14, v15, Address(sp, 48));
 4956     __ ldpd(v12, v13, Address(sp, 32));
 4957     __ ldpd(v10, v11, Address(sp, 16));
 4958     __ ldpd(v8, v9, __ post(sp, 64));
 4959 
 4960     __ ret(lr);
 4961 
 4962     // record the stub entry and end
 4963     store_archive_data(stub_id, start, __ pc());
 4964 
 4965     return start;
 4966   }
 4967 
 4968   // Inputs:
 4969   //   c_rarg0   - long[]  state0
 4970   //   c_rarg1   - long[]  state1
 4971   address generate_double_keccak() {
 4972     StubId stub_id = StubId::stubgen_double_keccak_id;
 4973     int entry_count = StubInfo::entry_count(stub_id);
 4974     assert(entry_count == 1, "sanity check");
 4975     address start = load_archive_data(stub_id);
 4976     if (start != nullptr) {
 4977       return start;
 4978     }
    // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
 4981     __ align(CodeEntryAlignment);
 4982     StubCodeMark mark(this, stub_id);
 4983     start = __ pc();
 4984     __ enter();
 4985 
 4986     Register state0        = c_rarg0;
 4987     Register state1        = c_rarg1;
 4988 
 4989     Label rounds24_loop;
 4990 
 4991     // save callee-saved registers
 4992     __ stpd(v8, v9, __ pre(sp, -64));
 4993     __ stpd(v10, v11, Address(sp, 16));
 4994     __ stpd(v12, v13, Address(sp, 32));
 4995     __ stpd(v14, v15, Address(sp, 48));
 4996 
 4997     // load states
 4998     __ add(rscratch1, state0, 32);
 4999     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 5000     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 5001     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 5002     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 5003     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 5004     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 5005     __ ld1(v24, __ D, 0, rscratch1);
 5006     __ add(rscratch1, state1, 32);
 5007     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 5008     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 5009     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 5010     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 5011     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 5012     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 5013     __ ld1(v24, __ D, 1, rscratch1);
 5014 
 5015     // 24 keccak rounds
 5016     __ movw(rscratch2, 24);
 5017 
 5018     // load round_constants base
 5019     __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
 5020 
 5021     __ BIND(rounds24_loop);
 5022     __ subw(rscratch2, rscratch2, 1);
 5023     keccak_round(rscratch1);
 5024     __ cbnzw(rscratch2, rounds24_loop);
 5025 
 5026     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 5027     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 5028     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 5029     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 5030     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 5031     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 5032     __ st1(v24, __ D, 0, state0);
 5033     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 5034     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 5035     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 5036     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 5037     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 5038     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 5039     __ st1(v24, __ D, 1, state1);
 5040 
 5041     // restore callee-saved vector registers
 5042     __ ldpd(v14, v15, Address(sp, 48));
 5043     __ ldpd(v12, v13, Address(sp, 32));
 5044     __ ldpd(v10, v11, Address(sp, 16));
 5045     __ ldpd(v8, v9, __ post(sp, 64));
 5046 
 5047     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5048     __ mov(r0, zr); // return 0
 5049     __ ret(lr);
 5050 
 5051     // record the stub entry and end
 5052     store_archive_data(stub_id, start, __ pc());
 5053 
 5054     return start;
 5055   }
 5056 
 5057   // ChaCha20 block function.  This version parallelizes the 32-bit
 5058   // state elements on each of 16 vectors, producing 4 blocks of
 5059   // keystream at a time.
 5060   //
 5061   // state (int[16]) = c_rarg0
 5062   // keystream (byte[256]) = c_rarg1
 5063   // return - number of bytes of produced keystream (always 256)
 5064   //
 5065   // This implementation takes each 32-bit integer from the state
 5066   // array and broadcasts it across all 4 32-bit lanes of a vector register
 5067   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 5068   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 5069   // the quarter round schedule is implemented as outlined in RFC 7539 section
 5070   // 2.3.  However, instead of sequentially processing the 3 quarter round
 5071   // operations represented by one QUARTERROUND function, we instead stack all
 5072   // the adds, xors and left-rotations from the first 4 quarter rounds together
 5073   // and then do the same for the second set of 4 quarter rounds.  This removes
 5074   // some latency that would otherwise be incurred by waiting for an add to
 5075   // complete before performing an xor (which depends on the result of the
 5076   // add), etc. An adjustment happens between the first and second groups of 4
 5077   // quarter rounds, but this is done only in the inputs to the macro functions
 5078   // that generate the assembly instructions - these adjustments themselves are
 5079   // not part of the resulting assembly.
 5080   // The 4 registers v0-v3 are used during the quarter round operations as
 5081   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 5082   // registers become the vectors involved in adding the start state back onto
 5083   // the post-QR working state.  After the adds are complete, each of the 16
 5084   // vectors write their first lane back to the keystream buffer, followed
 5085   // by the second lane from all vectors and so on.
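  //
  // For reference, a single quarter round from RFC 7539 section 2.1, which
  // the cc20_qr_* bundles below apply to four column or diagonal (a, b, c, d)
  // sets at a time:
  //   a += b; d ^= a; d <<<= 16;
  //   c += d; b ^= c; b <<<= 12;
  //   a += b; d ^= a; d <<<= 8;
  //   c += d; b ^= c; b <<<= 7;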
 5086   address generate_chacha20Block_blockpar() {
 5087     StubId stub_id = StubId::stubgen_chacha20Block_id;
 5088     int entry_count = StubInfo::entry_count(stub_id);
 5089     assert(entry_count == 1, "sanity check");
 5090     address start = load_archive_data(stub_id);
 5091     if (start != nullptr) {
 5092       return start;
 5093     }
 5094     Label L_twoRounds, L_cc20_const;
 5095     __ align(CodeEntryAlignment);
 5096     StubCodeMark mark(this, stub_id);
 5097     start = __ pc();
 5098     __ enter();
 5099 
 5100     int i, j;
 5101     const Register state = c_rarg0;
 5102     const Register keystream = c_rarg1;
 5103     const Register loopCtr = r10;
 5104     const Register tmpAddr = r11;
 5105     const FloatRegister ctrAddOverlay = v28;
 5106     const FloatRegister lrot8Tbl = v29;
 5107 
 5108     // Organize SIMD registers in an array that facilitates
 5109     // putting repetitive opcodes into loop structures.  It is
 5110     // important that each grouping of 4 registers is monotonically
 5111     // increasing to support the requirements of multi-register
 5112     // instructions (e.g. ld4r, st4, etc.)
 5113     const FloatRegister workSt[16] = {
 5114          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 5115         v20, v21, v22, v23, v24, v25, v26, v27
 5116     };
 5117 
 5118     // Pull in constant data.  The first 16 bytes are the add overlay
 5119     // which is applied to the vector holding the counter (state[12]).
 5120     // The second 16 bytes is the index register for the 8-bit left
 5121     // rotation tbl instruction.
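    // (Rotating a 32-bit lane left by 8 is a pure byte permutation, so one
    // tbl lookup suffices: for each word, destination byte i is sourced
    // from source byte (i + 3) mod 4.)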
 5122     __ adr(tmpAddr, L_cc20_const);
 5123     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 5124 
 5125     // Load from memory and interlace across 16 SIMD registers,
 5126     // With each word from memory being broadcast to all lanes of
 5127     // each successive SIMD register.
 5128     //      Addr(0) -> All lanes in workSt[i]
    //      Addr(4) -> All lanes in workSt[i + 1], etc.
 5130     __ mov(tmpAddr, state);
 5131     for (i = 0; i < 16; i += 4) {
 5132       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 5133           __ post(tmpAddr, 16));
 5134     }
 5135     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5136 
 5137     // Before entering the loop, create 5 4-register arrays.  These
 5138     // will hold the 4 registers that represent the a/b/c/d fields
 5139     // in the quarter round operation.  For instance the "b" field
 5140     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 5141     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 5142     // since it is part of a diagonal organization.  The aSet and scratch
 5143     // register sets are defined at declaration time because they do not change
 5144     // organization at any point during the 20-round processing.
 5145     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 5146     FloatRegister bSet[4];
 5147     FloatRegister cSet[4];
 5148     FloatRegister dSet[4];
 5149     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 5150 
 5151     // Set up the 10 iteration loop and perform all 8 quarter round ops
 5152     __ mov(loopCtr, 10);
 5153     __ BIND(L_twoRounds);
 5154 
 5155     // Set to columnar organization and do the following 4 quarter-rounds:
 5156     // QUARTERROUND(0, 4, 8, 12)
 5157     // QUARTERROUND(1, 5, 9, 13)
 5158     // QUARTERROUND(2, 6, 10, 14)
 5159     // QUARTERROUND(3, 7, 11, 15)
 5160     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 5161     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 5162     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 5163 
 5164     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5165     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5166     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5167 
 5168     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5169     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5170     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5171 
 5172     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5173     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5174     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5175 
 5176     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5177     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 5179 
 5180     // Set to diagonal organization and do the next 4 quarter-rounds:
 5181     // QUARTERROUND(0, 5, 10, 15)
 5182     // QUARTERROUND(1, 6, 11, 12)
 5183     // QUARTERROUND(2, 7, 8, 13)
 5184     // QUARTERROUND(3, 4, 9, 14)
 5185     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 5186     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 5187     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 5188 
 5189     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5190     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5191     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5192 
 5193     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5194     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5195     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5196 
 5197     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5198     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5199     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5200 
 5201     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5202     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 5204 
 5205     // Decrement and iterate
 5206     __ sub(loopCtr, loopCtr, 1);
 5207     __ cbnz(loopCtr, L_twoRounds);
 5208 
 5209     __ mov(tmpAddr, state);
 5210 
 5211     // Add the starting state back to the post-loop keystream
 5212     // state.  We read/interlace the state array from memory into
 5213     // 4 registers similar to what we did in the beginning.  Then
 5214     // add the counter overlay onto workSt[12] at the end.
 5215     for (i = 0; i < 16; i += 4) {
 5216       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 5217       __ addv(workSt[i], __ T4S, workSt[i], v0);
 5218       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 5219       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 5220       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 5221     }
 5222     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5223 
 5224     // Write working state into the keystream buffer.  This is accomplished
 5225     // by taking the lane "i" from each of the four vectors and writing
 5226     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 5227     // repeating with the next 4 vectors until all 16 vectors have been used.
 5228     // Then move to the next lane and repeat the process until all lanes have
 5229     // been written.
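    // Net effect: keystream block n (64 bytes) is lane n of workSt[0..15]
    // in order, so the four generated blocks land at offsets 0, 64, 128
    // and 192.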
 5230     for (i = 0; i < 4; i++) {
 5231       for (j = 0; j < 16; j += 4) {
 5232         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 5233             __ post(keystream, 16));
 5234       }
 5235     }
 5236 
 5237     __ mov(r0, 256);             // Return length of output keystream
 5238     __ leave();
 5239     __ ret(lr);
 5240 
 5241     // bind label and generate local constant data used by this stub
 5242     // The constant data is broken into two 128-bit segments to be loaded
 5243     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 5244     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 5245     // The second 128-bits is a table constant used for 8-bit left rotations.
 5246     __ BIND(L_cc20_const);
 5247     __ emit_int64(0x0000000100000000UL);
 5248     __ emit_int64(0x0000000300000002UL);
 5249     __ emit_int64(0x0605040702010003UL);
 5250     __ emit_int64(0x0E0D0C0F0A09080BUL);
 5251 
 5252     // record the stub entry and end
 5253     store_archive_data(stub_id, start, __ pc());
 5254 
 5255     return start;
 5256   }
 5257 
 5258   // Helpers to schedule parallel operation bundles across vector
 5259   // register sequences of size 2, 4 or 8.
 5260 
 5261   // Implement various primitive computations across vector sequences
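  //
  // For example, if va, vb and vc are VSeq<4> sequences naming v0..v3,
  // v4..v7 and v8..v11 respectively, then vs_addv(va, __ T4S, vb, vc)
  // expands to the four instructions addv(v0, T4S, v4, v8) through
  // addv(v3, T4S, v7, v11).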
 5262 
 5263   template<int N>
 5264   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5265                const VSeq<N>& v1, const VSeq<N>& v2) {
 5266     // output must not be constant
 5267     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5268     // output cannot overwrite pending inputs
 5269     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5270     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5271     for (int i = 0; i < N; i++) {
 5272       __ addv(v[i], T, v1[i], v2[i]);
 5273     }
 5274   }
 5275 
 5276   template<int N>
 5277   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5278                const VSeq<N>& v1, const VSeq<N>& v2) {
 5279     // output must not be constant
 5280     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5281     // output cannot overwrite pending inputs
 5282     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5283     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5284     for (int i = 0; i < N; i++) {
 5285       __ subv(v[i], T, v1[i], v2[i]);
 5286     }
 5287   }
 5288 
 5289   template<int N>
 5290   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5291                const VSeq<N>& v1, const VSeq<N>& v2) {
 5292     // output must not be constant
 5293     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5294     // output cannot overwrite pending inputs
 5295     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5296     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5297     for (int i = 0; i < N; i++) {
 5298       __ mulv(v[i], T, v1[i], v2[i]);
 5299     }
 5300   }
 5301 
 5302   template<int N>
 5303   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 5304     // output must not be constant
 5305     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5306     // output cannot overwrite pending inputs
 5307     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5308     for (int i = 0; i < N; i++) {
 5309       __ negr(v[i], T, v1[i]);
 5310     }
 5311   }
 5312 
 5313   template<int N>
 5314   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5315                const VSeq<N>& v1, int shift) {
 5316     // output must not be constant
 5317     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5318     // output cannot overwrite pending inputs
 5319     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5320     for (int i = 0; i < N; i++) {
 5321       __ sshr(v[i], T, v1[i], shift);
 5322     }
 5323   }
 5324 
 5325   template<int N>
 5326   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5327     // output must not be constant
 5328     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5329     // output cannot overwrite pending inputs
 5330     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5331     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5332     for (int i = 0; i < N; i++) {
 5333       __ andr(v[i], __ T16B, v1[i], v2[i]);
 5334     }
 5335   }
 5336 
 5337   template<int N>
 5338   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5339     // output must not be constant
 5340     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5341     // output cannot overwrite pending inputs
 5342     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5343     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5344     for (int i = 0; i < N; i++) {
 5345       __ orr(v[i], __ T16B, v1[i], v2[i]);
 5346     }
 5347   }
 5348 
 5349   template<int N>
 5350   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 5351     // output must not be constant
 5352     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5353     // output cannot overwrite pending inputs
 5354     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5355     for (int i = 0; i < N; i++) {
 5356       __ notr(v[i], __ T16B, v1[i]);
 5357     }
 5358   }
 5359 
 5360   template<int N>
 5361   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 5362     // output must not be constant
 5363     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5364     // output cannot overwrite pending inputs
 5365     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5366     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5367     for (int i = 0; i < N; i++) {
 5368       __ sqdmulh(v[i], T, v1[i], v2[i]);
 5369     }
 5370   }
 5371 
 5372   template<int N>
 5373   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 5374     // output must not be constant
 5375     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5376     // output cannot overwrite pending inputs
 5377     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5378     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5379     for (int i = 0; i < N; i++) {
 5380       __ mlsv(v[i], T, v1[i], v2[i]);
 5381     }
 5382   }
 5383 
 5384   // load N/2 successive pairs of quadword values from memory in order
 5385   // into N successive vector registers of the sequence via the
 5386   // address supplied in base.
 5387   template<int N>
  void vs_ldpq(const VSeq<N>& v, Register base) {
    static_assert((N & 1) == 0, "sequence length must be even");
    for (int i = 0; i < N; i += 2) {
 5390       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 5391     }
 5392   }
 5393 
 5394   // load N/2 successive pairs of quadword values from memory in order
 5395   // into N vector registers of the sequence via the address supplied
 5396   // in base using post-increment addressing
 5397   template<int N>
 5398   void vs_ldpq_post(const VSeq<N>& v, Register base) {
    static_assert((N & 1) == 0, "sequence length must be even");
 5400     for (int i = 0; i < N; i += 2) {
 5401       __ ldpq(v[i], v[i+1], __ post(base, 32));
 5402     }
 5403   }
 5404 
 5405   // store N successive vector registers of the sequence into N/2
 5406   // successive pairs of quadword memory locations via the address
 5407   // supplied in base using post-increment addressing
 5408   template<int N>
 5409   void vs_stpq_post(const VSeq<N>& v, Register base) {
    static_assert((N & 1) == 0, "sequence length must be even");
 5411     for (int i = 0; i < N; i += 2) {
 5412       __ stpq(v[i], v[i+1], __ post(base, 32));
 5413     }
 5414   }
 5415 
 5416   // load N/2 pairs of quadword values from memory de-interleaved into
 5417   // N vector registers 2 at a time via the address supplied in base
 5418   // using post-increment addressing.
 5419   template<int N>
 5420   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert((N & 1) == 0, "sequence length must be even");
 5422     for (int i = 0; i < N; i += 2) {
 5423       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 5424     }
 5425   }
 5426 
 5427   // store N vector registers interleaved into N/2 pairs of quadword
 5428   // memory locations via the address supplied in base using
 5429   // post-increment addressing.
 5430   template<int N>
 5431   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert((N & 1) == 0, "sequence length must be even");
 5433     for (int i = 0; i < N; i += 2) {
 5434       __ st2(v[i], v[i+1], T, __ post(base, 32));
 5435     }
 5436   }
 5437 
 5438   // load N quadword values from memory de-interleaved into N vector
 5439   // registers 3 elements at a time via the address supplied in base.
 5440   template<int N>
 5441   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5442     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 5443     for (int i = 0; i < N; i += 3) {
 5444       __ ld3(v[i], v[i+1], v[i+2], T, base);
 5445     }
 5446   }
 5447 
 5448   // load N quadword values from memory de-interleaved into N vector
 5449   // registers 3 elements at a time via the address supplied in base
 5450   // using post-increment addressing.
 5451   template<int N>
 5452   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5453     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 5454     for (int i = 0; i < N; i += 3) {
 5455       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 5456     }
 5457   }
 5458 
 5459   // load N/2 pairs of quadword values from memory into N vector
 5460   // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 5462   // offsets array
 5463   template<int N>
 5464   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5465     for (int i = 0; i < N/2; i++) {
 5466       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5467     }
 5468   }
 5469 
 5470   // store N vector registers into N/2 pairs of quadword memory
 5471   // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 5473   // offsets array
 5474   template<int N>
  void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5476     for (int i = 0; i < N/2; i++) {
 5477       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5478     }
 5479   }
 5480 
 5481   // load N single quadword values from memory into N vector registers
 5482   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 5484   // array
 5485   template<int N>
 5486   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5487                       int start, int (&offsets)[N]) {
 5488     for (int i = 0; i < N; i++) {
 5489       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5490     }
 5491   }
 5492 
 5493   // store N vector registers into N single quadword memory locations
 5494   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 5496   // array
 5497   template<int N>
 5498   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5499                       int start, int (&offsets)[N]) {
 5500     for (int i = 0; i < N; i++) {
 5501       __ str(v[i], T, Address(base, start + offsets[i]));
 5502     }
 5503   }
 5504 
 5505   // load N/2 pairs of quadword values from memory de-interleaved into
 5506   // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
 5508   // corresponding entry in the offsets array
 5509   template<int N>
 5510   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5511                       Register tmp, int start, int (&offsets)[N/2]) {
 5512     for (int i = 0; i < N/2; i++) {
 5513       __ add(tmp, base, start + offsets[i]);
 5514       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5515     }
 5516   }
 5517 
 5518   // store N vector registers 2 at a time interleaved into N/2 pairs
 5519   // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
 5521   // corresponding entry in the offsets array
 5522   template<int N>
 5523   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5524                       Register tmp, int start, int (&offsets)[N/2]) {
 5525     for (int i = 0; i < N/2; i++) {
 5526       __ add(tmp, base, start + offsets[i]);
 5527       __ st2(v[2*i], v[2*i+1], T, tmp);
 5528     }
 5529   }
 5530 
 5531   // Helper routines for various flavours of Montgomery multiply
 5532 
  // Perform 16 32-bit (4x4S) or 32 16-bit (4x8H) Montgomery
  // multiplications in parallel.
  //
  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q_INV_MOD_R, MONT_Q>
  // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
  // Outputs: va - 4x4S or 4x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta, i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
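  //
  // As an illustrative scalar model of what each lane computes in the
  // 16-bit case (R = 2^16; names here are expository only, not part of
  // the stub):
  //
  //   int16_t montmul16(int16_t b, int16_t c, int16_t q, int16_t qinv) {
  //     int32_t bc    = (int32_t)b * c;
  //     int16_t aHigh = (int16_t)((2 * bc) >> 16);             // sqdmulh
  //     int16_t aLow  = (int16_t)bc;                           // mulv
  //     int16_t m     = (int16_t)(aLow * qinv);                // mulv by vq[0]
  //     int16_t n     = (int16_t)((2 * (int32_t)m * q) >> 16); // sqdmulh by vq[1]
  //     return (int16_t)((aHigh - n) >> 1);                    // shsubv
  //   }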
 5550   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5551                    Assembler::SIMD_Arrangement T,
 5552                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5553     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5554     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5555     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5556     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5557 
 5558     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5559     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5560 
 5561     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5562 
 5563     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5564     assert(vs_disjoint(va, vb), "va and vb overlap");
 5565     assert(vs_disjoint(va, vq), "va and vq overlap");
 5566     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5567     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5568 
 5569     // schedule 4 streams of instructions across the vector sequences
 5570     for (int i = 0; i < 4; i++) {
 5571       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5572       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5573     }
 5574 
 5575     for (int i = 0; i < 4; i++) {
 5576       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5577     }
 5578 
 5579     for (int i = 0; i < 4; i++) {
 5580       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5581     }
 5582 
 5583     for (int i = 0; i < 4; i++) {
 5584       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5585     }
 5586   }
 5587 
  // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
  // multiplications in parallel.
  //
  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q_INV_MOD_R, MONT_Q>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta, i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5605   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5606                    Assembler::SIMD_Arrangement T,
 5607                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5608     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5609     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5610     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5611     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5612 
 5613     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5614     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5615 
 5616     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5617 
 5618     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5619     assert(vs_disjoint(va, vb), "va and vb overlap");
 5620     assert(vs_disjoint(va, vq), "va and vq overlap");
 5621     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5622     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5623 
 5624     // schedule 2 streams of instructions across the vector sequences
 5625     for (int i = 0; i < 2; i++) {
 5626       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5627       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5628     }
 5629 
 5630     for (int i = 0; i < 2; i++) {
 5631       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5632     }
 5633 
 5634     for (int i = 0; i < 2; i++) {
 5635       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5636     }
 5637 
 5638     for (int i = 0; i < 2; i++) {
 5639       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5640     }
 5641   }
 5642 
 5643   // Perform 16 16-bit Montgomery multiplications in parallel.
 5644   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5645                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5646     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5647     // It will assert that the register use is valid
 5648     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5649   }
 5650 
 5651   // Perform 32 16-bit Montgomery multiplications in parallel.
 5652   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5653                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5654     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5655     // It will assert that the register use is valid
 5656     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5657   }
 5658 
 5659   // Perform 64 16-bit Montgomery multiplications in parallel.
 5660   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5661                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5662     // Schedule two successive 4x8H multiplies via the montmul helper
 5663     // on the front and back halves of va, vb and vc. The helper will
 5664     // assert that the register use has no overlap conflicts on each
 5665     // individual call but we also need to ensure that the necessary
 5666     // disjoint/equality constraints are met across both calls.
 5667 
 5668     // vb, vc, vtmp and vq must be disjoint. va must either be
 5669     // disjoint from all other registers or equal vc
 5670 
 5671     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5672     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5673     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5674 
 5675     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5676     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5677 
 5678     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5679 
 5680     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5681     assert(vs_disjoint(va, vb), "va and vb overlap");
 5682     assert(vs_disjoint(va, vq), "va and vq overlap");
 5683     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5684 
 5685     // we multiply the front and back halves of each sequence 4 at a
 5686     // time because
 5687     //
 5688     // 1) we are currently only able to get 4-way instruction
 5689     // parallelism at best
 5690     //
 5691     // 2) we need registers for the constants in vq and temporary
 5692     // scratch registers to hold intermediate results so vtmp can only
 5693     // be a VSeq<4> which means we only have 4 scratch slots
 5694 
 5695     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5696     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5697   }
 5698 
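  // n.b. the next routine is one butterfly layer of the forward NTT
  // (Cooley-Tukey style): for each pair of coefficient vectors
  // (a0, a1) and zeta vector c it computes
  //   a0' = a0 + montmul(a1, c)
  //   a1' = a0 - montmul(a1, c)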
 5699   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5700                                const VSeq<4>& vc,
 5701                                const VSeq<4>& vtmp,
 5702                                const VSeq<2>& vq) {
 5703     // compute a = montmul(a1, c)
 5704     kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 5706     vs_subv(va1, __ T8H, va0, vc);
 5707     //    and a0 = a0 + a
 5708     vs_addv(va0, __ T8H, va0, vc);
 5709   }
 5710 
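  // n.b. the next routine is the matching butterfly layer of the
  // inverse NTT (Gentleman-Sande style): for each pair of coefficient
  // vectors (a0, a1) and zeta vector b it computes
  //   a0' = a0 + a1
  //   a1' = montmul(a0 - a1, b)
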
 5711   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5712                                const VSeq<4>& vb,
 5713                                const VSeq<4>& vtmp1,
 5714                                const VSeq<4>& vtmp2,
 5715                                const VSeq<2>& vq) {
 5716     // compute c = a0 - a1
 5717     vs_subv(vtmp1, __ T8H, va0, va1);
 5718     // output a0 = a0 + a1
 5719     vs_addv(va0, __ T8H, va0, va1);
 5720     // output a1 = b montmul c
 5721     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5722   }
 5723 
 5724   void load64shorts(const VSeq<8>& v, Register shorts) {
 5725     vs_ldpq_post(v, shorts);
 5726   }
 5727 
 5728   void load32shorts(const VSeq<4>& v, Register shorts) {
 5729     vs_ldpq_post(v, shorts);
 5730   }
 5731 
  void store64shorts(const VSeq<8>& v, Register tmpAddr) {
 5733     vs_stpq_post(v, tmpAddr);
 5734   }
 5735 
 5736   // Kyber NTT function.
 5737   // Implements
 5738   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5739   //
 5740   // coeffs (short[256]) = c_rarg0
 5741   // ntt_zetas (short[256]) = c_rarg1
 5742   address generate_kyberNtt() {
 5743     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5744     int entry_count = StubInfo::entry_count(stub_id);
 5745     assert(entry_count == 1, "sanity check");
 5746     address start = load_archive_data(stub_id);
 5747     if (start != nullptr) {
 5748       return start;
 5749     }
 5750     __ align(CodeEntryAlignment);
 5751     StubCodeMark mark(this, stub_id);
 5752     start = __ pc();
 5753     __ enter();
 5754 
 5755     const Register coeffs = c_rarg0;
 5756     const Register zetas = c_rarg1;
 5757 
 5758     const Register kyberConsts = r10;
 5759     const Register tmpAddr = r11;
 5760 
 5761     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5762     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5763     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5764 
 5765     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5766     // load the montmul constants
 5767     vs_ldpq(vq, kyberConsts);
 5768 
    // Each level corresponds to an iteration of the outermost loop of the
    // Java method seilerNTT(int[] coeffs). There are some differences
    // from what is done in the seilerNTT() method, though:
    // 1. The computation uses 16-bit signed values; we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a bigger array: 128 zetas are stored
    // for each level, which makes it easier to fill up the vector registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (that way there should not be any overflow during
    // the inverse NTT computation); here we use R = 2^16 so that we can
    // use 16-bit arithmetic in the vector unit.
 5781     //
 5782     // On each level, we fill up the vector registers in such a way that the
 5783     // array elements that need to be multiplied by the zetas go into one
 5784     // set of vector registers while the corresponding ones that don't need to
 5785     // be multiplied, go into another set.
 5786     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5787     // registers interleaving the steps of 4 identical computations,
 5788     // each done on 8 16-bit values per register.
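    //
    // As a scalar sketch (a simplified model of the Java loop, ignoring
    // the Montgomery details and the stub's 128-per-level zeta layout),
    // one level with block size len performs:
    //
    //   for (int start = 0; start < 256; start += 2 * len) {
    //     int zeta = zetas[k++];
    //     for (int j = start; j < start + len; j++) {
    //       int t = montmul(zeta, coeffs[j + len]);
    //       coeffs[j + len] = coeffs[j] - t;
    //       coeffs[j] = coeffs[j] + t;
    //     }
    //   }
    //
    // with len halving from 128 at level 0 down to 2 at level 6.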
 5789 
    // At levels 0-3 the coefficients that get multiplied by the zetas, and
    // the corresponding coefficients that the products get added to or
    // subtracted from, occur in discrete blocks whose size is some
    // multiple of 32.
 5793 
 5794     // level 0
 5795     __ add(tmpAddr, coeffs, 256);
 5796     load64shorts(vs1, tmpAddr);
 5797     load64shorts(vs2, zetas);
 5798     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5799     __ add(tmpAddr, coeffs, 0);
 5800     load64shorts(vs1, tmpAddr);
 5801     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5802     vs_addv(vs1, __ T8H, vs1, vs2);
 5803     __ add(tmpAddr, coeffs, 0);
 5804     vs_stpq_post(vs1, tmpAddr);
 5805     __ add(tmpAddr, coeffs, 256);
 5806     vs_stpq_post(vs3, tmpAddr);
 5807     // restore montmul constants
 5808     vs_ldpq(vq, kyberConsts);
 5809     load64shorts(vs1, tmpAddr);
 5810     load64shorts(vs2, zetas);
 5811     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5812     __ add(tmpAddr, coeffs, 128);
 5813     load64shorts(vs1, tmpAddr);
 5814     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5815     vs_addv(vs1, __ T8H, vs1, vs2);
 5816     __ add(tmpAddr, coeffs, 128);
 5817     store64shorts(vs1, tmpAddr);
 5818     __ add(tmpAddr, coeffs, 384);
 5819     store64shorts(vs3, tmpAddr);
 5820 
 5821     // level 1
 5822     // restore montmul constants
 5823     vs_ldpq(vq, kyberConsts);
 5824     __ add(tmpAddr, coeffs, 128);
 5825     load64shorts(vs1, tmpAddr);
 5826     load64shorts(vs2, zetas);
 5827     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5828     __ add(tmpAddr, coeffs, 0);
 5829     load64shorts(vs1, tmpAddr);
 5830     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5831     vs_addv(vs1, __ T8H, vs1, vs2);
 5832     __ add(tmpAddr, coeffs, 0);
 5833     store64shorts(vs1, tmpAddr);
 5834     store64shorts(vs3, tmpAddr);
 5835     vs_ldpq(vq, kyberConsts);
 5836     __ add(tmpAddr, coeffs, 384);
 5837     load64shorts(vs1, tmpAddr);
 5838     load64shorts(vs2, zetas);
 5839     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5840     __ add(tmpAddr, coeffs, 256);
 5841     load64shorts(vs1, tmpAddr);
 5842     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5843     vs_addv(vs1, __ T8H, vs1, vs2);
 5844     __ add(tmpAddr, coeffs, 256);
 5845     store64shorts(vs1, tmpAddr);
 5846     store64shorts(vs3, tmpAddr);
 5847 
 5848     // level 2
 5849     vs_ldpq(vq, kyberConsts);
 5850     int offsets1[4] = { 0, 32, 128, 160 };
 5851     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5852     load64shorts(vs2, zetas);
 5853     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5854     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5856     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5857     vs_addv(vs1, __ T8H, vs1, vs2);
 5858     __ add(tmpAddr, coeffs, 0);
 5859     vs_stpq_post(vs_front(vs1), tmpAddr);
 5860     vs_stpq_post(vs_front(vs3), tmpAddr);
 5861     vs_stpq_post(vs_back(vs1), tmpAddr);
 5862     vs_stpq_post(vs_back(vs3), tmpAddr);
 5863     vs_ldpq(vq, kyberConsts);
 5864     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5865     load64shorts(vs2, zetas);
 5866     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5867     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5869     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5870     vs_addv(vs1, __ T8H, vs1, vs2);
 5871     __ add(tmpAddr, coeffs, 256);
 5872     vs_stpq_post(vs_front(vs1), tmpAddr);
 5873     vs_stpq_post(vs_front(vs3), tmpAddr);
 5874     vs_stpq_post(vs_back(vs1), tmpAddr);
 5875     vs_stpq_post(vs_back(vs3), tmpAddr);
 5876 
 5877     // level 3
 5878     vs_ldpq(vq, kyberConsts);
 5879     int offsets2[4] = { 0, 64, 128, 192 };
 5880     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5881     load64shorts(vs2, zetas);
 5882     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5883     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5884     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5885     vs_addv(vs1, __ T8H, vs1, vs2);
 5886     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5887     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5888 
 5889     vs_ldpq(vq, kyberConsts);
 5890     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5891     load64shorts(vs2, zetas);
 5892     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5893     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5894     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5895     vs_addv(vs1, __ T8H, vs1, vs2);
 5896     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5897     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5898 
 5899     // level 4
    // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5902 
 5903     vs_ldpq(vq, kyberConsts);
 5904     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5905     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5906     load64shorts(vs2, zetas);
 5907     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5908     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5909     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5910     vs_addv(vs1, __ T8H, vs1, vs2);
 5911     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5912     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5913 
 5914     vs_ldpq(vq, kyberConsts);
 5915     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5916     load64shorts(vs2, zetas);
 5917     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5918     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5919     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5920     vs_addv(vs1, __ T8H, vs1, vs2);
 5921     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5922     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5923 
 5924     // level 5
    // At level 5 related coefficients occur in discrete blocks of size 8 so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
 5927 
 5928     vs_ldpq(vq, kyberConsts);
 5929     int offsets4[4] = { 0, 32, 64, 96 };
 5930     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5931     load32shorts(vs_front(vs2), zetas);
 5932     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5933     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5934     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5935     load32shorts(vs_front(vs2), zetas);
 5936     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5937     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5938     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5939     load32shorts(vs_front(vs2), zetas);
 5940     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5941     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5942 
 5943     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5944     load32shorts(vs_front(vs2), zetas);
 5945     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5946     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5947 
 5948     // level 6
    // At level 6 related coefficients occur in discrete blocks of size 4 so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 4S.
 5951 
 5952     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5953     load32shorts(vs_front(vs2), zetas);
 5954     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5955     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5956     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5958     load32shorts(vs_front(vs2), zetas);
 5959     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5960     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5961 
 5962     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5963     load32shorts(vs_front(vs2), zetas);
 5964     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5965     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5966 
 5967     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5968     load32shorts(vs_front(vs2), zetas);
 5969     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5970     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5971 
 5972     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5973     __ mov(r0, zr); // return 0
 5974     __ ret(lr);
 5975 
 5976     // record the stub entry and end
 5977     store_archive_data(stub_id, start, __ pc());
 5978 
 5979     return start;
 5980   }
 5981 
 5982   // Kyber Inverse NTT function
 5983   // Implements
 5984   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5985   //
 5986   // coeffs (short[256]) = c_rarg0
  // zetas (short[256]) = c_rarg1
 5988   address generate_kyberInverseNtt() {
 5989     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5990     int entry_count = StubInfo::entry_count(stub_id);
 5991     assert(entry_count == 1, "sanity check");
 5992     address start = load_archive_data(stub_id);
 5993     if (start != nullptr) {
 5994       return start;
 5995     }
 5996     __ align(CodeEntryAlignment);
 5997     StubCodeMark mark(this, stub_id);
 5998     start = __ pc();
 5999     __ enter();
 6000 
 6001     const Register coeffs = c_rarg0;
 6002     const Register zetas = c_rarg1;
 6003 
 6004     const Register kyberConsts = r10;
 6005     const Register tmpAddr = r11;
 6006     const Register tmpAddr2 = c_rarg2;
 6007 
 6008     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 6009     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6010     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6011 
 6012     __ lea(kyberConsts,
 6013              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6014 
 6015     // level 0
    // At level 0 related coefficients occur in discrete blocks of size 4 so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 4S.
 6018 
 6019     vs_ldpq(vq, kyberConsts);
 6020     int offsets4[4] = { 0, 32, 64, 96 };
 6021     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6022     load32shorts(vs_front(vs2), zetas);
 6023     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6024                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6025     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6026     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6027     load32shorts(vs_front(vs2), zetas);
 6028     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6029                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6030     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6031     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6032     load32shorts(vs_front(vs2), zetas);
 6033     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6034                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6035     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6036     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6037     load32shorts(vs_front(vs2), zetas);
 6038     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6039                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6040     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6041 
 6042     // level 1
    // At level 1 related coefficients occur in discrete blocks of size 8 so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
 6045 
 6046     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6047     load32shorts(vs_front(vs2), zetas);
 6048     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6049                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6050     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6051     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6052     load32shorts(vs_front(vs2), zetas);
 6053     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6054                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6055     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6056 
 6057     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6058     load32shorts(vs_front(vs2), zetas);
 6059     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6060                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6061     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6062     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6063     load32shorts(vs_front(vs2), zetas);
 6064     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6065                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6066     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6067 
 6068     // level 2
    // At level 2 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 6071 
 6072     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6073     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6074     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6075     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6076     vs_subv(vs1, __ T8H, vs1, vs2);
 6077     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 6078     load64shorts(vs2, zetas);
 6079     vs_ldpq(vq, kyberConsts);
 6080     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6081     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6082 
 6083     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6084     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6085     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6086     vs_subv(vs1, __ T8H, vs1, vs2);
 6087     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 6088     load64shorts(vs2, zetas);
 6089     vs_ldpq(vq, kyberConsts);
 6090     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6091     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6092 
 6093     // Barrett reduction at indexes where overflow may happen
 6094 
 6095     // load q and the multiplier for the Barrett reduction
 6096     __ add(tmpAddr, kyberConsts, 16);
 6097     vs_ldpq(vq, tmpAddr);
 6098 
    VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8-sequences replicating
    VSeq<8> vq2 = VSeq<8>(vq[1], 0); // the above two kyber constants
    VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for constant montmul
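
    // As a scalar sketch, the Barrett step below computes, per 16-bit
    // lane a (with q replicated in vq1 and the precomputed multiplier
    // replicated in vq2):
    //
    //   t = ((2 * a * mult) >> 16) >> 11;  // sqdmulh + sshr
    //   a = a - t * q;                     // mlsv
    //
    // t approximates a / q, so the result is a small representative of
    // a mod q, keeping the following levels free of overflow.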
 6102     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6103     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6104     vs_sshr(vs2, __ T8H, vs2, 11);
 6105     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6106     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6107     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6108     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6109     vs_sshr(vs2, __ T8H, vs2, 11);
 6110     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6111     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6112 
 6113     // level 3
    // From level 3 upwards coefficients occur in discrete blocks whose size
    // is some multiple of 32, so they can be loaded using ldpq and suitable
    // indexes.
 6116 
 6117     int offsets2[4] = { 0, 64, 128, 192 };
 6118     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 6119     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 6120     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6121     vs_subv(vs1, __ T8H, vs1, vs2);
 6122     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 6123     load64shorts(vs2, zetas);
 6124     vs_ldpq(vq, kyberConsts);
 6125     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6126     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 6127 
 6128     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 6129     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6130     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6131     vs_subv(vs1, __ T8H, vs1, vs2);
 6132     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 6133     load64shorts(vs2, zetas);
 6134     vs_ldpq(vq, kyberConsts);
 6135     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6136     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6137 
 6138     // level 4
 6139 
 6140     int offsets1[4] = { 0, 32, 128, 160 };
 6141     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 6142     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 6143     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6144     vs_subv(vs1, __ T8H, vs1, vs2);
 6145     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 6146     load64shorts(vs2, zetas);
 6147     vs_ldpq(vq, kyberConsts);
 6148     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6149     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 6150 
 6151     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 6152     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6153     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6154     vs_subv(vs1, __ T8H, vs1, vs2);
 6155     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 6156     load64shorts(vs2, zetas);
 6157     vs_ldpq(vq, kyberConsts);
 6158     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6159     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6160 
 6161     // level 5
 6162 
 6163     __ add(tmpAddr, coeffs, 0);
 6164     load64shorts(vs1, tmpAddr);
 6165     __ add(tmpAddr, coeffs, 128);
 6166     load64shorts(vs2, tmpAddr);
 6167     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6168     vs_subv(vs1, __ T8H, vs1, vs2);
 6169     __ add(tmpAddr, coeffs, 0);
 6170     store64shorts(vs3, tmpAddr);
 6171     load64shorts(vs2, zetas);
 6172     vs_ldpq(vq, kyberConsts);
 6173     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6174     __ add(tmpAddr, coeffs, 128);
 6175     store64shorts(vs2, tmpAddr);
 6176 
 6177     load64shorts(vs1, tmpAddr);
 6178     __ add(tmpAddr, coeffs, 384);
 6179     load64shorts(vs2, tmpAddr);
 6180     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6181     vs_subv(vs1, __ T8H, vs1, vs2);
 6182     __ add(tmpAddr, coeffs, 256);
 6183     store64shorts(vs3, tmpAddr);
 6184     load64shorts(vs2, zetas);
 6185     vs_ldpq(vq, kyberConsts);
 6186     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6187     __ add(tmpAddr, coeffs, 384);
 6188     store64shorts(vs2, tmpAddr);
 6189 
 6190     // Barrett reduction at indexes where overflow may happen
 6191 
 6192     // load q and the multiplier for the Barrett reduction
 6193     __ add(tmpAddr, kyberConsts, 16);
 6194     vs_ldpq(vq, tmpAddr);
 6195 
 6196     int offsets0[2] = { 0, 256 };
 6197     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6198     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6199     vs_sshr(vs2, __ T8H, vs2, 11);
 6200     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6201     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6202 
 6203     // level 6
 6204 
 6205     __ add(tmpAddr, coeffs, 0);
 6206     load64shorts(vs1, tmpAddr);
 6207     __ add(tmpAddr, coeffs, 256);
 6208     load64shorts(vs2, tmpAddr);
 6209     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6210     vs_subv(vs1, __ T8H, vs1, vs2);
 6211     __ add(tmpAddr, coeffs, 0);
 6212     store64shorts(vs3, tmpAddr);
 6213     load64shorts(vs2, zetas);
 6214     vs_ldpq(vq, kyberConsts);
 6215     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6216     __ add(tmpAddr, coeffs, 256);
 6217     store64shorts(vs2, tmpAddr);
 6218 
 6219     __ add(tmpAddr, coeffs, 128);
 6220     load64shorts(vs1, tmpAddr);
 6221     __ add(tmpAddr, coeffs, 384);
 6222     load64shorts(vs2, tmpAddr);
 6223     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6224     vs_subv(vs1, __ T8H, vs1, vs2);
 6225     __ add(tmpAddr, coeffs, 128);
 6226     store64shorts(vs3, tmpAddr);
 6227     load64shorts(vs2, zetas);
 6228     vs_ldpq(vq, kyberConsts);
 6229     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6230     __ add(tmpAddr, coeffs, 384);
 6231     store64shorts(vs2, tmpAddr);
 6232 
 6233     // multiply by 2^-n
 6234 
 6235     // load toMont(2^-n mod q)
 6236     __ add(tmpAddr, kyberConsts, 48);
 6237     __ ldr(v29, __ Q, tmpAddr);
 6238 
 6239     vs_ldpq(vq, kyberConsts);
 6240     __ add(tmpAddr, coeffs, 0);
 6241     load64shorts(vs1, tmpAddr);
 6242     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6243     __ add(tmpAddr, coeffs, 0);
 6244     store64shorts(vs2, tmpAddr);
 6245 
    // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
 6247     load64shorts(vs1, tmpAddr);
 6248     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6249     __ add(tmpAddr, coeffs, 128);
 6250     store64shorts(vs2, tmpAddr);
 6251 
 6252     // now tmpAddr contains coeffs + 256
 6253     load64shorts(vs1, tmpAddr);
 6254     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6255     __ add(tmpAddr, coeffs, 256);
 6256     store64shorts(vs2, tmpAddr);
 6257 
 6258     // now tmpAddr contains coeffs + 384
 6259     load64shorts(vs1, tmpAddr);
 6260     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6261     __ add(tmpAddr, coeffs, 384);
 6262     store64shorts(vs2, tmpAddr);
 6263 
 6264     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6265     __ mov(r0, zr); // return 0
 6266     __ ret(lr);
 6267 
 6268     // record the stub entry and end
 6269     store_archive_data(stub_id, start, __ pc());
 6270 
 6271     return start;
 6272   }
 6273 
 6274   // Kyber multiply polynomials in the NTT domain.
 6275   // Implements
 6276   // static int implKyberNttMult(
 6277   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 6278   //
 6279   // result (short[256]) = c_rarg0
 6280   // ntta (short[256]) = c_rarg1
 6281   // nttb (short[256]) = c_rarg2
 6282   // zetas (short[128]) = c_rarg3
 6283   address generate_kyberNttMult() {
 6284     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 6285     int entry_count = StubInfo::entry_count(stub_id);
 6286     assert(entry_count == 1, "sanity check");
 6287     address start = load_archive_data(stub_id);
 6288     if (start != nullptr) {
 6289       return start;
 6290     }
 6291     __ align(CodeEntryAlignment);
 6292     StubCodeMark mark(this, stub_id);
 6293     start = __ pc();
 6294     __ enter();
 6295 
 6296     const Register result = c_rarg0;
 6297     const Register ntta = c_rarg1;
 6298     const Register nttb = c_rarg2;
 6299     const Register zetas = c_rarg3;
 6300 
 6301     const Register kyberConsts = r10;
 6302     const Register limit = r11;
 6303 
    VSeq<4> vs1(0), vs2(4);  // 4 sets of 4x8H inputs/outputs/tmps
 6305     VSeq<4> vs3(16), vs4(20);
 6306     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 6307     VSeq<2> vz(28);          // pair of zetas
 6308     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 6309 
 6310     __ lea(kyberConsts,
 6311              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6312 
 6313     Label kyberNttMult_loop;
 6314 
 6315     __ add(limit, result, 512);
 6316 
    // load qinv and q
 6318     vs_ldpq(vq, kyberConsts);
 6319 
 6320     // load R^2 mod q (to convert back from Montgomery representation)
 6321     __ add(kyberConsts, kyberConsts, 64);
 6322     __ ldr(v27, __ Q, kyberConsts);
 6323 
 6324     __ BIND(kyberNttMult_loop);
 6325 
 6326     // load 16 zetas
 6327     vs_ldpq_post(vz, zetas);
 6328 
 6329     // load 2 sets of 32 coefficients from the two input arrays
    // interleaved as shorts, i.e. pairs of shorts adjacent in memory
 6331     // are striped across pairs of vector registers
 6332     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 6333     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 6334     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 6335     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 6336 
 6337     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 6338     // i.e. montmul the first and second halves of vs1 in order and
 6339     // then with one sequence reversed storing the two results in vs3
 6340     //
 6341     // vs3[0] <- montmul(a0, b0)
 6342     // vs3[1] <- montmul(a1, b1)
 6343     // vs3[2] <- montmul(a0, b1)
 6344     // vs3[3] <- montmul(a1, b0)
 6345     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 6346     kyber_montmul16(vs_back(vs3),
 6347                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 6348 
 6349     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 6350     // i.e. montmul the first and second halves of vs4 in order and
 6351     // then with one sequence reversed storing the two results in vs1
 6352     //
 6353     // vs1[0] <- montmul(a2, b2)
 6354     // vs1[1] <- montmul(a3, b3)
 6355     // vs1[2] <- montmul(a2, b3)
 6356     // vs1[3] <- montmul(a3, b2)
 6357     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 6358     kyber_montmul16(vs_back(vs1),
 6359                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 6360 
 6361     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 6362     // We can schedule two montmuls at a time if we use a suitable vector
 6363     // sequence <vs3[1], vs1[1]>.
 6364     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 6365     VSeq<2> vs5(vs3[1], delta);
 6366 
 6367     // vs3[1] <- montmul(montmul(a1, b1), z0)
 6368     // vs1[1] <- montmul(montmul(a3, b3), z1)
 6369     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 6370 
 6371     // add results in pairs storing in vs3
 6372     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 6373     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 6374     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 6375 
 6376     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 6377     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 6378     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
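
    // n.b. taken together, the cross-products and pairwise additions
    // above compute products of degree-1 polynomials modulo (X^2 - z):
    //   (a0 + a1*X) * (b0 + b1*X) mod (X^2 - z)
    //     = (a0*b0 + a1*b1*z) + (a0*b1 + a1*b0)*X
    // which is the base-case multiplication in the Kyber NTT domain.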
 6379 
 6380     // vs1 <- montmul(vs3, montRSquareModQ)
 6381     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 6382 
    // store back the two pairs of result vectors de-interleaved as 8H
    // elements, i.e. each pair of shorts is striped across a register
    // pair and stored adjacent in memory
 6386     vs_st2_post(vs1, __ T8H, result);
 6387 
 6388     __ cmp(result, limit);
 6389     __ br(Assembler::NE, kyberNttMult_loop);
 6390 
 6391     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6392     __ mov(r0, zr); // return 0
 6393     __ ret(lr);
 6394 
 6395     // record the stub entry and end
 6396     store_archive_data(stub_id, start, __ pc());
 6397 
 6398     return start;
 6399   }
 6400 
 6401   // Kyber add 2 polynomials.
 6402   // Implements
 6403   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 6404   //
 6405   // result (short[256]) = c_rarg0
 6406   // a (short[256]) = c_rarg1
 6407   // b (short[256]) = c_rarg2
 6408   address generate_kyberAddPoly_2() {
 6409     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 6410     int entry_count = StubInfo::entry_count(stub_id);
 6411     assert(entry_count == 1, "sanity check");
 6412     address start = load_archive_data(stub_id);
 6413     if (start != nullptr) {
 6414       return start;
 6415     }
 6416     __ align(CodeEntryAlignment);
 6417     StubCodeMark mark(this, stub_id);
 6418     start = __ pc();
 6419     __ enter();
 6420 
 6421     const Register result = c_rarg0;
 6422     const Register a = c_rarg1;
 6423     const Register b = c_rarg2;
 6424 
 6425     const Register kyberConsts = r11;
 6426 
    // We sum 256 pairs of values, i.e. 32 x 8H quadwords in total.
    // So, we can load, add and store the data in 3 groups of 11,
    // 11 and 10 quadwords at a time, i.e. we need to map sets of 10
    // or 11 registers. A further constraint is that the mapping needs
    // to skip callee saves. So, we allocate the register
    // sequences using two 8 sequences, two 2 sequences and two
    // single registers.
 6434     VSeq<8> vs1_1(0);
 6435     VSeq<2> vs1_2(16);
 6436     FloatRegister vs1_3 = v28;
 6437     VSeq<8> vs2_1(18);
 6438     VSeq<2> vs2_2(26);
 6439     FloatRegister vs2_3 = v29;
 6440 
 6441     // two constant vector sequences
 6442     VSeq<8> vc_1(31, 0);
 6443     VSeq<2> vc_2(31, 0);
 6444 
 6445     FloatRegister vc_3 = v31;
 6446     __ lea(kyberConsts,
 6447              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6448 
 6449     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6450     for (int i = 0; i < 3; i++) {
 6451       // load 80 or 88 values from a into vs1_1/2/3
 6452       vs_ldpq_post(vs1_1, a);
 6453       vs_ldpq_post(vs1_2, a);
 6454       if (i < 2) {
 6455         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6456       }
 6457       // load 80 or 88 values from b into vs2_1/2/3
 6458       vs_ldpq_post(vs2_1, b);
 6459       vs_ldpq_post(vs2_2, b);
 6460       if (i < 2) {
 6461         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6462       }
 6463       // sum 80 or 88 values across vs1 and vs2 into vs1
 6464       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6465       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6466       if (i < 2) {
 6467         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6468       }
 6469       // add constant to all 80 or 88 results
 6470       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6471       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6472       if (i < 2) {
 6473         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6474       }
 6475       // store 80 or 88 values
 6476       vs_stpq_post(vs1_1, result);
 6477       vs_stpq_post(vs1_2, result);
 6478       if (i < 2) {
 6479         __ str(vs1_3, __ Q, __ post(result, 16));
 6480       }
 6481     }
 6482 
 6483     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6484     __ mov(r0, zr); // return 0
 6485     __ ret(lr);
 6486 
 6487     // record the stub entry and end
 6488     store_archive_data(stub_id, start, __ pc());
 6489 
 6490     return start;
 6491   }
 6492 
 6493   // Kyber add 3 polynomials.
 6494   // Implements
 6495   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 6496   //
 6497   // result (short[256]) = c_rarg0
 6498   // a (short[256]) = c_rarg1
 6499   // b (short[256]) = c_rarg2
 6500   // c (short[256]) = c_rarg3
 6501   address generate_kyberAddPoly_3() {
 6502     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 6503     int entry_count = StubInfo::entry_count(stub_id);
 6504     assert(entry_count == 1, "sanity check");
 6505     address start = load_archive_data(stub_id);
 6506     if (start != nullptr) {
 6507       return start;
 6508     }
 6509     __ align(CodeEntryAlignment);
 6510     StubCodeMark mark(this, stub_id);
 6511     start = __ pc();
 6512     __ enter();
 6513 
 6514     const Register result = c_rarg0;
 6515     const Register a = c_rarg1;
 6516     const Register b = c_rarg2;
 6517     const Register c = c_rarg3;
 6518 
 6519     const Register kyberConsts = r11;
 6520 
    // As above, we sum 256 triples of values, i.e. 32 x 8H
    // quadwords in total.  So, we can load, add and store the data
    // in 3 groups of 11, 11 and 10 quadwords at a time, i.e. we need
    // to map sets of 10 or 11 registers. A further constraint is that
    // the mapping needs to skip callee saves. So, we allocate the
    // register sequences using two 8 sequences, two 2 sequences
    // and two single registers.
 6528     VSeq<8> vs1_1(0);
 6529     VSeq<2> vs1_2(16);
 6530     FloatRegister vs1_3 = v28;
 6531     VSeq<8> vs2_1(18);
 6532     VSeq<2> vs2_2(26);
 6533     FloatRegister vs2_3 = v29;
 6534 
 6535     // two constant vector sequences
 6536     VSeq<8> vc_1(31, 0);
 6537     VSeq<2> vc_2(31, 0);
 6538 
 6539     FloatRegister vc_3 = v31;
 6540 
 6541     __ lea(kyberConsts,
 6542              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6543 
 6544     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6545     for (int i = 0; i < 3; i++) {
 6546       // load 80 or 88 values from a into vs1_1/2/3
 6547       vs_ldpq_post(vs1_1, a);
 6548       vs_ldpq_post(vs1_2, a);
 6549       if (i < 2) {
 6550         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6551       }
 6552       // load 80 or 88 values from b into vs2_1/2/3
 6553       vs_ldpq_post(vs2_1, b);
 6554       vs_ldpq_post(vs2_2, b);
 6555       if (i < 2) {
 6556         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6557       }
 6558       // sum 80 or 88 values across vs1 and vs2 into vs1
 6559       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6560       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6561       if (i < 2) {
 6562         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6563       }
 6564       // load 80 or 88 values from c into vs2_1/2/3
 6565       vs_ldpq_post(vs2_1, c);
 6566       vs_ldpq_post(vs2_2, c);
 6567       if (i < 2) {
 6568         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6569       }
 6570       // sum 80 or 88 values across vs1 and vs2 into vs1
 6571       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6572       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6573       if (i < 2) {
 6574         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6575       }
 6576       // add constant to all 80 or 88 results
 6577       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6578       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6579       if (i < 2) {
 6580         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6581       }
 6582       // store 80 or 88 values
 6583       vs_stpq_post(vs1_1, result);
 6584       vs_stpq_post(vs1_2, result);
 6585       if (i < 2) {
 6586         __ str(vs1_3, __ Q, __ post(result, 16));
 6587       }
 6588     }
 6589 
 6590     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6591     __ mov(r0, zr); // return 0
 6592     __ ret(lr);
 6593 
 6594     // record the stub entry and end
 6595     store_archive_data(stub_id, start, __ pc());
 6596 
 6597     return start;
 6598   }
 6599 
 6600   // Kyber parse XOF output to polynomial coefficient candidates
 6601   // or decodePoly(12, ...).
 6602   // Implements
 6603   // static int implKyber12To16(
 6604   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6605   //
 6606   // we assume that parsed and condensed are allocated such that for
 6607   // n = (parsedLength + 63) / 64
 6608   // n blocks of 96 bytes of input can be processed, i.e.
 6609   // index + n * 96 <= condensed.length and
 6610   // n * 64 <= parsed.length
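  // (e.g. parsedLength == 256 gives n == 4, i.e. at least 4 * 96 = 384
  // bytes of input and room for 4 * 64 = 256 parsed shorts)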
 6611   //
 6612   // condensed (byte[]) = c_rarg0
 6613   // condensedIndex = c_rarg1
 6614   // parsed (short[]) = c_rarg2
 6615   // parsedLength = c_rarg3
 6616   address generate_kyber12To16() {
 6617     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6618     int entry_count = StubInfo::entry_count(stub_id);
 6619     assert(entry_count == 1, "sanity check");
 6620     address start = load_archive_data(stub_id);
 6621     if (start != nullptr) {
 6622       return start;
 6623     }
 6624     Label L_F00, L_loop;
 6625 
 6626     __ align(CodeEntryAlignment);
 6627     StubCodeMark mark(this, stub_id);
 6628     start = __ pc();
 6629     __ enter();
 6630 
 6631     const Register condensed = c_rarg0;
 6632     const Register condensedOffs = c_rarg1;
 6633     const Register parsed = c_rarg2;
 6634     const Register parsedLength = c_rarg3;
 6635 
 6636     const Register tmpAddr = r11;
 6637 
 6638     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6639     // quadwords so we need a 6 vector sequence for the inputs.
 6640     // Parsing produces 64 shorts, employing two 8 vector
 6641     // sequences to store and combine the intermediate data.
 6642     VSeq<6> vin(24);
 6643     VSeq<8> va(0), vb(16);
 6644 
 6645     __ adr(tmpAddr, L_F00);
 6646     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6647     __ add(condensed, condensed, condensedOffs);
 6648 
 6649     __ BIND(L_loop);
 6650     // load 96 (6 x 16B) byte values
 6651     vs_ld3_post(vin, __ T16B, condensed);
 6652 
 6653     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6654     // holds 48 (16x3) contiguous bytes from memory striped
 6655     // horizontally across each of the 16 byte lanes. Equivalently,
 6656     // that is 16 pairs of 12-bit integers. Likewise the back half
 6657     // holds the next 48 bytes in the same arrangement.
 6658 
 6659     // Each vector in the front half can also be viewed as a vertical
 6660     // strip across the 16 pairs of 12 bit integers. Each byte in
 6661     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6662     // byte in vin[1] stores the high 4 bits of the first int and the
 6663     // low 4 bits of the second int. Each byte in vin[2] stores the
 6664     // high 8 bits of the second int. Likewise the vectors in second
 6665     // half.
 6666 
 6667     // Converting the data to 16-bit shorts requires first of all
 6668     // expanding each of the 6 x 16B vectors into 6 corresponding
 6669     // pairs of 8H vectors. Mask, shift and add operations on the
 6670     // resulting vector pairs can be used to combine 4 and 8 bit
 6671     // parts of related 8H vector elements.
 6672     //
 6673     // The middle vectors (vin[2] and vin[5]) are actually expanded
 6674     // twice, one copy manipulated to provide the lower 4 bits
 6675     // belonging to the first short in a pair and another copy
 6676     // manipulated to provide the higher 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
 6678     // and vb used to hold the expanded 8H elements are of length 8.
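    // As a scalar sketch of what the lane operations below compute for
    // one byte triple (b0, b1, b2) holding a packed pair of 12-bit
    // values (assuming ML-KEM's little-endian 12-bit packing):
    //
    //   s0 = b0 | ((b1 & 0x0f) << 8);  // va[0] + masked, shifted va[2]
    //   s1 = (b1 >> 4) | (b2 << 4);    // shifted va[4] + pre-shifted va[6]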
 6679 
 6680     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6681     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6682     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6683     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6684     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6685     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6686     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6687     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6688 
 6689     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6690     // and vb[4:5]
 6691     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6692     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6693     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6694     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6695     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6696     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6697 
 6698     // shift lo byte of copy 1 of the middle stripe into the high byte
 6699     __ shl(va[2], __ T8H, va[2], 8);
 6700     __ shl(va[3], __ T8H, va[3], 8);
 6701     __ shl(vb[2], __ T8H, vb[2], 8);
 6702     __ shl(vb[3], __ T8H, vb[3], 8);
 6703 
 6704     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6705     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6706     // are in bit positions [4..11].
 6707     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6708     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6709     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6710     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6711 
 6712     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6713     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6714     // copy2
 6715     __ andr(va[2], __ T16B, va[2], v31);
 6716     __ andr(va[3], __ T16B, va[3], v31);
 6717     __ ushr(va[4], __ T8H, va[4], 4);
 6718     __ ushr(va[5], __ T8H, va[5], 4);
 6719     __ andr(vb[2], __ T16B, vb[2], v31);
 6720     __ andr(vb[3], __ T16B, vb[3], v31);
 6721     __ ushr(vb[4], __ T8H, vb[4], 4);
 6722     __ ushr(vb[5], __ T8H, vb[5], 4);
 6723 
 6724     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6725     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6726     // n.b. the ordering ensures: i) inputs are consumed before they
 6727     // are overwritten ii) the order of 16-bit results across successive
 6728     // pairs of vectors in va and then vb reflects the order of the
 6729     // corresponding 12-bit inputs
 6730     __ addv(va[0], __ T8H, va[0], va[2]);
 6731     __ addv(va[2], __ T8H, va[1], va[3]);
 6732     __ addv(va[1], __ T8H, va[4], va[6]);
 6733     __ addv(va[3], __ T8H, va[5], va[7]);
 6734     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6735     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6736     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6737     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6738 
 6739     // store 64 results interleaved as shorts
 6740     vs_st2_post(vs_front(va), __ T8H, parsed);
 6741     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6742 
 6743     __ sub(parsedLength, parsedLength, 64);
 6744     __ cmp(parsedLength, (u1)0);
 6745     __ br(Assembler::GT, L_loop);
 6746 
 6747     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6748     __ mov(r0, zr); // return 0
 6749     __ ret(lr);
 6750 
 6751     // bind label and generate constant data used by this stub
 6752     __ BIND(L_F00);
 6753     __ emit_int64(0x0f000f000f000f00);
 6754     __ emit_int64(0x0f000f000f000f00);
 6755 
 6756     // record the stub entry and end
 6757     store_archive_data(stub_id, start, __ pc());
 6758 
 6759     return start;
 6760   }
 6761 
 6762   // Kyber Barrett reduce function.
 6763   // Implements
 6764   // static int implKyberBarrettReduce(short[] coeffs) {}
 6765   //
 6766   // coeffs (short[256]) = c_rarg0
 6767   address generate_kyberBarrettReduce() {
 6768     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6769     int entry_count = StubInfo::entry_count(stub_id);
 6770     assert(entry_count == 1, "sanity check");
 6771     address start = load_archive_data(stub_id);
 6772     if (start != nullptr) {
 6773       return start;
 6774     }
 6775     __ align(CodeEntryAlignment);
 6776     StubCodeMark mark(this, stub_id);
 6777     start = __ pc();
 6778     __ enter();
 6779 
 6780     const Register coeffs = c_rarg0;
 6781 
 6782     const Register kyberConsts = r10;
 6783     const Register result = r11;
 6784 
 6785     // As above we process 256 sets of values in total i.e. 32 x
 6786     // 8H quadwords. So, we can load, add and store the data in 3
 6787     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6788     // of 10 or 11 registers. A further constraint is that the
 6789     // mapping needs to skip callee saves. So, we allocate the
 6790     // register sequences using two 8 sequences, two 2 sequences
 6791     // and two single registers.
 6792     VSeq<8> vs1_1(0);
 6793     VSeq<2> vs1_2(16);
 6794     FloatRegister vs1_3 = v28;
 6795     VSeq<8> vs2_1(18);
 6796     VSeq<2> vs2_2(26);
 6797     FloatRegister vs2_3 = v29;
 6798 
 6799     // we also need a pair of corresponding constant sequences
 6800 
 6801     VSeq<8> vc1_1(30, 0);
 6802     VSeq<2> vc1_2(30, 0);
 6803     FloatRegister vc1_3 = v30; // for kyber_q
 6804 
 6805     VSeq<8> vc2_1(31, 0);
 6806     VSeq<2> vc2_2(31, 0);
 6807     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6808 
 6809     __ add(result, coeffs, 0);
 6810     __ lea(kyberConsts,
 6811              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6812 
 6813     // load q and the multiplier for the Barrett reduction
 6814     __ add(kyberConsts, kyberConsts, 16);
 6815     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6816 
 6817     for (int i = 0; i < 3; i++) {
 6818       // load 80 or 88 coefficients
 6819       vs_ldpq_post(vs1_1, coeffs);
 6820       vs_ldpq_post(vs1_2, coeffs);
 6821       if (i < 2) {
 6822         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6823       }
 6824 
 6825       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6826       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6827       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6828       if (i < 2) {
 6829         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6830       }
 6831 
 6832       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6833       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6834       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6835       if (i < 2) {
 6836         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6837       }
 6838 
 6839       // vs1 <- vs1 - vs2 * kyber_q
 6840       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6841       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6842       if (i < 2) {
 6843         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6844       }
 6845 
 6846       vs_stpq_post(vs1_1, result);
 6847       vs_stpq_post(vs1_2, result);
 6848       if (i < 2) {
 6849         __ str(vs1_3, __ Q, __ post(result, 16));
 6850       }
 6851     }
 6852 
 6853     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6854     __ mov(r0, zr); // return 0
 6855     __ ret(lr);
 6856 
 6857     // record the stub entry and end
 6858     store_archive_data(stub_id, start, __ pc());
 6859 
 6860     return start;
 6861   }
 6862 
 6863 
 6864   // Dilithium-specific montmul helper routines that generate parallel
 6865   // code for, respectively, a single 4x4s vector sequence montmul or
 6866   // two such multiplies in a row.
 6867 
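  // For reference, each 32-bit lane is assumed to compute the standard
  // CRYSTALS-style Montgomery product a * b * R^-1 mod q with R = 2^32,
  // roughly (a scalar sketch of what vs_montmul4 schedules per lane):
  //
  //   int64_t t = (int64_t) a * b;
  //   int32_t m = (int32_t) t * qinv;          // q * qinv == 1 mod 2^32
  //   int32_t r = (t - (int64_t) m * q) >> 32; // r == a * b * R^-1 mod q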
 6868   // Perform 16 32-bit Montgomery multiplications in parallel
 6869   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6870                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6871     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6872     // It will assert that the register use is valid
 6873     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6874   }
 6875 
 6876   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6877   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6878                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6879     // Schedule two successive 4x4S multiplies via the montmul helper
 6880     // on the front and back halves of va, vb and vc. The helper will
 6881     // assert that the register use has no overlap conflicts on each
 6882     // individual call but we also need to ensure that the necessary
 6883     // disjoint/equality constraints are met across both calls.
 6884 
 6885     // vb, vc, vtmp and vq must be disjoint. va must either be
 6886     // disjoint from all other registers or equal vc
 6887 
 6888     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6889     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6890     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6891 
 6892     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6893     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6894 
 6895     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6896 
 6897     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6898     assert(vs_disjoint(va, vb), "va and vb overlap");
 6899     assert(vs_disjoint(va, vq), "va and vq overlap");
 6900     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6901 
 6902     // We multiply the front and back halves of each sequence 4 at a
 6903     // time because
 6904     //
 6905     // 1) we are currently only able to get 4-way instruction
 6906     // parallelism at best
 6907     //
 6908     // 2) we need registers for the constants in vq and temporary
 6909     // scratch registers to hold intermediate results so vtmp can only
 6910     // be a VSeq<4> which means we only have 4 scratch slots.
 6911 
 6912     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6913     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6914   }
 6915 
 6916   // Perform combined montmul then add/sub on 4x4S vectors.
 6917   void dilithium_montmul16_sub_add(
 6918           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6919           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6920     // compute a = montmul(a1, c)
 6921     dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 6923     vs_subv(va1, __ T4S, va0, vc);
 6924     //    and a0 = a0 + a
 6925     vs_addv(va0, __ T4S, va0, vc);
 6926   }
 6927 
  // Perform combined add/sub then montmul on 4x4S vectors.
 6929   void dilithium_sub_add_montmul16(
 6930           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6931           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6932     // compute c = a0 - a1
 6933     vs_subv(vtmp1, __ T4S, va0, va1);
 6934     // output a0 = a0 + a1
 6935     vs_addv(va0, __ T4S, va0, va1);
 6936     // output a1 = b montmul c
 6937     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6938   }
 6939 
 6940   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6941   // in the Java implementation come in sequences of at least 8, so we
 6942   // can use ldpq to collect the corresponding data into pairs of vector
 6943   // registers.
 6944   // We collect the coefficients corresponding to the 'j+l' indexes into
 6945   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6946   // then we do the (Montgomery) multiplications by the zetas in parallel
 6947   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6948   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6949   // v0-v7 and finally save the results back to the coeffs array.
 6950   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6951     const Register coeffs, const Register zetas) {
 6952     int c1 = 0;
 6953     int c2 = 512;
 6954     int startIncr;
 6955     // don't use callee save registers v8 - v15
 6956     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6957     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6958     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6959     int offsets[4] = { 0, 32, 64, 96 };
 6960 
 6961     for (int level = 0; level < 5; level++) {
 6962       int c1Start = c1;
 6963       int c2Start = c2;
 6964       if (level == 3) {
 6965         offsets[1] = 32;
 6966         offsets[2] = 128;
 6967         offsets[3] = 160;
 6968       } else if (level == 4) {
 6969         offsets[1] = 64;
 6970         offsets[2] = 128;
 6971         offsets[3] = 192;
 6972       }
 6973 
 6974       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6975       // time at 4 different offsets and multiply them in order by the
 6976       // next set of input values. So we employ indexed load and store
 6977       // pair instructions with arrangement 4S.
 6978       for (int i = 0; i < 4; i++) {
 6979         // reload q and qinv
 6980         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6981         // load 8x4S coefficients via second start pos == c2
 6982         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6983         // load next 8x4S inputs == b
 6984         vs_ldpq_post(vs2, zetas);
 6985         // compute a == c2 * b mod MONT_Q
 6986         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6987         // load 8x4s coefficients via first start pos == c1
 6988         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6989         // compute a1 =  c1 + a
 6990         vs_addv(vs3, __ T4S, vs1, vs2);
 6991         // compute a2 =  c1 - a
 6992         vs_subv(vs1, __ T4S, vs1, vs2);
 6993         // output a1 and a2
 6994         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6995         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6996 
 6997         int k = 4 * level + i;
 6998 
 6999         if (k > 7) {
 7000           startIncr = 256;
 7001         } else if (k == 5) {
 7002           startIncr = 384;
 7003         } else {
 7004           startIncr = 128;
 7005         }
 7006 
 7007         c1Start += startIncr;
 7008         c2Start += startIncr;
 7009       }
 7010 
 7011       c2 /= 2;
 7012     }
 7013   }
 7014 
 7015   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 7016   // Implements the method
  // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
  // of the sun.security.provider.ML_DSA class.
 7019   //
 7020   // coeffs (int[256]) = c_rarg0
 7021   // zetas (int[256]) = c_rarg1
 7022   address generate_dilithiumAlmostNtt() {
 7023     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 7024     int entry_count = StubInfo::entry_count(stub_id);
 7025     assert(entry_count == 1, "sanity check");
 7026     address start = load_archive_data(stub_id);
 7027     if (start != nullptr) {
 7028       return start;
 7029     }
 7030     __ align(CodeEntryAlignment);
 7031     StubCodeMark mark(this, stub_id);
 7032     start = __ pc();
 7033     __ enter();
 7034 
 7035     const Register coeffs = c_rarg0;
 7036     const Register zetas = c_rarg1;
 7037 
 7038     const Register tmpAddr = r9;
 7039     const Register dilithiumConsts = r10;
 7040     const Register result = r11;
 7041     // don't use callee save registers v8 - v15
 7042     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7043     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7044     VSeq<2> vq(30);                    // n.b. constants overlap vs3
    int offsets[4] = { 0, 32, 64, 96 };
 7046     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7047     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7048     __ add(result, coeffs, 0);
 7049     __ lea(dilithiumConsts,
 7050              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7051 
 7052     // Each level represents one iteration of the outer for loop of the Java version.
 7053 
 7054     // level 0-4
 7055     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 7056 
 7057     // level 5
 7058 
 7059     // At level 5 the coefficients we need to combine with the zetas
 7060     // are grouped in memory in blocks of size 4. So, for both sets of
 7061     // coefficients we load 4 adjacent values at 8 different offsets
 7062     // using an indexed ldr with register variant Q and multiply them
 7063     // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
 7065     for (int i = 0; i < 1024; i += 256) {
 7066       // reload constants q, qinv each iteration as they get clobbered later
 7067       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7068       // load 32 (8x4S) coefficients via first offsets = c1
 7069       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7070       // load next 32 (8x4S) inputs = b
 7071       vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
 7073       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7074       // load 32 (8x4S) coefficients via second offsets = c2
 7075       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 7076       // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 7079       // write back new coefficients using same offsets
 7080       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 7081       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 7082     }
 7083 
 7084     // level 6
    // At level 6 the coefficients we need to combine with the zetas
    // are grouped in memory in blocks of 4, the first pair of each
    // block being the add/sub inputs and the second pair the montmul
    // inputs. We can still implement the montmul+sub+add using 4-way
    // parallelism but only if we combine the coefficients with the
    // zetas 16 at a time. We load 8 adjacent values at 4 different
    // offsets using an ld2 load with arrangement 2D. That
    // de-interleaves the lower and upper halves of each pair of
    // quadwords into successive vector registers, leaving the add/sub
    // inputs in the even registers of the coefficients register
    // sequence and the montmul inputs in the odd registers. We then
    // montmul the 4 odd registers by the zetas in order and add/sub
    // the results against the 4 even registers. We use an equivalent
    // st2 operation to store the results back to memory in the
    // original interleaved layout.
 7098     for (int i = 0; i < 1024; i += 128) {
 7099       // reload constants q, qinv each iteration as they get clobbered later
 7100       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7101       // load interleaved 16 (4x2D) coefficients via offsets
 7102       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7103       // load next 16 (4x4S) inputs
 7104       vs_ldpq_post(vs_front(vs2), zetas);
 7105       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7106       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7107                                   vs_front(vs2), vtmp, vq);
 7108       // store interleaved 16 (4x2D) coefficients via offsets
 7109       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7110     }
 7111 
 7112     // level 7
    // At level 7 the coefficients we need to combine with the zetas
    // occur singly, with add/sub inputs alternating with montmul
    // inputs. Once again we can use 4-way parallelism to combine 16
    // zetas at a time. However, we have to load 8 adjacent values at
    // 4 different offsets using an ld2 load with arrangement 4S. That
    // de-interleaves the even words of each pair into one
    // coefficients vector register and the odd words into the next,
    // leaving the add/sub inputs in the even registers of the
    // coefficients register sequence and the montmul inputs in the
    // odd registers. We then montmul the 4 odd registers by the zetas
    // in order and add/sub the results against the 4 even registers.
    // We use an equivalent st2 operation to store the results back to
    // memory in the original interleaved layout.
 7125 
 7126     for (int i = 0; i < 1024; i += 128) {
 7127       // reload constants q, qinv each iteration as they get clobbered later
 7128       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7129       // load interleaved 16 (4x4S) coefficients via offsets
 7130       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7131       // load next 16 (4x4S) inputs
 7132       vs_ldpq_post(vs_front(vs2), zetas);
 7133       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7134       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7135                                   vs_front(vs2), vtmp, vq);
 7136       // store interleaved 16 (4x4S) coefficients via offsets
 7137       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7138     }
 7139     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7140     __ mov(r0, zr); // return 0
 7141     __ ret(lr);
 7142 
 7143     // record the stub entry and end
 7144     store_archive_data(stub_id, start, __ pc());
 7145 
 7146     return start;
 7147   }
 7148 
 7149   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 7150   // in the Java implementation come in sequences of at least 8, so we
 7151   // can use ldpq to collect the corresponding data into pairs of vector
  // registers.
  // We collect the coefficients that correspond to the 'j's into vs1,
  // the coefficients that correspond to the 'j+l's into vs2 then
 7155   // do the additions into vs3 and the subtractions into vs1 then
 7156   // save the result of the additions, load the zetas into vs2
 7157   // do the (Montgomery) multiplications by zeta in parallel into vs2
 7158   // finally save the results back to the coeffs array
 7159   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 7160     const Register coeffs, const Register zetas) {
 7161     int c1 = 0;
 7162     int c2 = 32;
 7163     int startIncr;
 7164     int offsets[4];
 7165     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7166     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7167     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7168 
 7169     offsets[0] = 0;
 7170 
 7171     for (int level = 3; level < 8; level++) {
 7172       int c1Start = c1;
 7173       int c2Start = c2;
 7174       if (level == 3) {
 7175         offsets[1] = 64;
 7176         offsets[2] = 128;
 7177         offsets[3] = 192;
 7178       } else if (level == 4) {
 7179         offsets[1] = 32;
 7180         offsets[2] = 128;
 7181         offsets[3] = 160;
 7182       } else {
 7183         offsets[1] = 32;
 7184         offsets[2] = 64;
 7185         offsets[3] = 96;
 7186       }
 7187 
 7188       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 7189       // time at 4 different offsets and multiply them in order by the
 7190       // next set of input values. So we employ indexed load and store
 7191       // pair instructions with arrangement 4S.
 7192       for (int i = 0; i < 4; i++) {
 7193         // load v1 32 (8x4S) coefficients relative to first start index
 7194         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7195         // load v2 32 (8x4S) coefficients relative to second start index
 7196         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
        // a0 = v1 + v2 -- n.b. clobbers vq which overlaps vs3
 7198         vs_addv(vs3, __ T4S, vs1, vs2);
 7199         // a1 = v1 - v2
 7200         vs_subv(vs1, __ T4S, vs1, vs2);
        // save a0 relative to first start index
 7202         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7203         // load constants q, qinv each iteration as they get clobbered above
 7204         vs_ldpq(vq, dilithiumConsts); // qInv, q
 7205         // load b next 32 (8x4S) inputs
 7206         vs_ldpq_post(vs2, zetas);
 7207         // a = a1 montmul b
 7208         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7209         // save a relative to second start index
 7210         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 7211 
 7212         int k = 4 * level + i;
 7213 
 7214         if (k < 24) {
 7215           startIncr = 256;
 7216         } else if (k == 25) {
 7217           startIncr = 384;
 7218         } else {
 7219           startIncr = 128;
 7220         }
 7221 
 7222         c1Start += startIncr;
 7223         c2Start += startIncr;
 7224       }
 7225 
 7226       c2 *= 2;
 7227     }
 7228   }
 7229 
 7230   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 7231   // Implements the method
 7232   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 7233   // the sun.security.provider.ML_DSA class.
 7234   //
 7235   // coeffs (int[256]) = c_rarg0
 7236   // zetas (int[256]) = c_rarg1
 7237   address generate_dilithiumAlmostInverseNtt() {
 7238     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 7239     int entry_count = StubInfo::entry_count(stub_id);
 7240     assert(entry_count == 1, "sanity check");
 7241     address start = load_archive_data(stub_id);
 7242     if (start != nullptr) {
 7243       return start;
 7244     }
 7245     __ align(CodeEntryAlignment);
 7246     StubCodeMark mark(this, stub_id);
 7247     start = __ pc();
 7248     __ enter();
 7249 
 7250     const Register coeffs = c_rarg0;
 7251     const Register zetas = c_rarg1;
 7252 
 7253     const Register tmpAddr = r9;
 7254     const Register dilithiumConsts = r10;
 7255     const Register result = r11;
 7256     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7257     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 7258     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7259     int offsets[4] = { 0, 32, 64, 96 };
 7260     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7261     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7262 
 7263     __ add(result, coeffs, 0);
 7264     __ lea(dilithiumConsts,
 7265              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7266 
 7267     // Each level represents one iteration of the outer for loop of the Java version
 7268 
 7269     // level 0
    // At level 0 we need to interleave adjacent pairs of
 7271     // coefficients before we multiply and add/sub by the next 16
 7272     // zetas just as we did for level 7 in the multiply code. So we
 7273     // load and store the values using an ld2/st2 with arrangement 4S.
 7274     for (int i = 0; i < 1024; i += 128) {
      // load constants q, qinv
      // n.b. this load can be moved out of the loop as q and qinv do
      // not get clobbered by the first two loops
 7278       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7279       // a0/a1 load interleaved 32 (8x4S) coefficients
 7280       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
      // b load next 16 (4x4S) inputs
 7282       vs_ldpq_post(vs_front(vs2), zetas);
 7283       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7284       // n.b. second half of vs2 provides temporary register storage
 7285       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7286                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7287       // a0/a1 store interleaved 32 (8x4S) coefficients
 7288       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7289     }
 7290 
 7291     // level 1
 7292     // At level 1 we need to interleave pairs of adjacent pairs of
 7293     // coefficients before we multiply by the next 16 zetas just as we
    // did for level 6 in the multiply code. So we load and store the
    // values using an ld2/st2 with arrangement 2D.
 7296     for (int i = 0; i < 1024; i += 128) {
 7297       // a0/a1 load interleaved 32 (8x2D) coefficients
 7298       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7299       // b load next 16 (4x4S) inputs
 7300       vs_ldpq_post(vs_front(vs2), zetas);
 7301       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7302       // n.b. second half of vs2 provides temporary register storage
 7303       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7304                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7305       // a0/a1 store interleaved 32 (8x2D) coefficients
 7306       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7307     }
 7308 
 7309     // level 2
 7310     // At level 2 coefficients come in blocks of 4. So, we load 4
 7311     // adjacent coefficients at 8 distinct offsets for both the first
 7312     // and second coefficient sequences, using an ldr with register
 7313     // variant Q then combine them with next set of 32 zetas. Likewise
 7314     // we store the results using an str with register variant Q.
 7315     for (int i = 0; i < 1024; i += 256) {
 7316       // c0 load 32 (8x4S) coefficients via first offsets
 7317       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7318       // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 7320       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 7321       vs_addv(vs3, __ T4S, vs1, vs2);
 7322       // c = c0 - c1
 7323       vs_subv(vs1, __ T4S, vs1, vs2);
 7324       // store a0 32 (8x4S) coefficients via first offsets
 7325       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 7326       // b load 32 (8x4S) next inputs
 7327       vs_ldpq_post(vs2, zetas);
 7328       // reload constants q, qinv -- they were clobbered earlier
 7329       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7330       // compute a1 = b montmul c
 7331       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7332       // store a1 32 (8x4S) coefficients via second offsets
 7333       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 7334     }
 7335 
 7336     // level 3-7
 7337     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 7338 
 7339     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7340     __ mov(r0, zr); // return 0
 7341     __ ret(lr);
 7342 
 7343     // record the stub entry and end
 7344     store_archive_data(stub_id, start, __ pc());
 7345 
 7346     return start;
 7347   }
 7348 
 7349   // Dilithium multiply polynomials in the NTT domain.
 7350   // Straightforward implementation of the method
 7351   // static int implDilithiumNttMult(
  //              int[] result, int[] ntta, int[] nttb) {} of
 7353   // the sun.security.provider.ML_DSA class.
 7354   //
 7355   // result (int[256]) = c_rarg0
 7356   // poly1 (int[256]) = c_rarg1
 7357   // poly2 (int[256]) = c_rarg2
 7358   address generate_dilithiumNttMult() {
 7359     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 7360     int entry_count = StubInfo::entry_count(stub_id);
 7361     assert(entry_count == 1, "sanity check");
 7362     address start = load_archive_data(stub_id);
 7363     if (start != nullptr) {
 7364       return start;
 7365     }
 7366     __ align(CodeEntryAlignment);
 7367     StubCodeMark mark(this, stub_id);
 7368     start = __ pc();
 7369     __ enter();
 7370 
 7371     Label L_loop;
 7372 
 7373     const Register result = c_rarg0;
 7374     const Register poly1 = c_rarg1;
 7375     const Register poly2 = c_rarg2;
 7376 
 7377     const Register dilithiumConsts = r10;
 7378     const Register len = r11;
 7379 
 7380     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7381     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7382     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7383     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 7384 
 7385     __ lea(dilithiumConsts,
 7386              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7387 
 7388     // load constants q, qinv
 7389     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7390     // load constant rSquare into v29
 7391     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 7392 
 7393     __ mov(len, zr);
 7394     __ add(len, len, 1024);
 7395 
 7396     __ BIND(L_loop);
 7397 
 7398     // b load 32 (8x4S) next inputs from poly1
 7399     vs_ldpq_post(vs1, poly1);
 7400     // c load 32 (8x4S) next inputs from poly2
 7401     vs_ldpq_post(vs2, poly2);
 7402     // compute a = b montmul c
 7403     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7404     // compute a = rsquare montmul a
 7405     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
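    // n.b. assuming rSquare == R^2 mod q with R = 2^32, this second
    // montmul multiplies by R and thereby cancels the R^-1 factor
    // introduced by the first montmul, leaving a plain a = b * c mod q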
 7406     // save a 32 (8x4S) results
 7407     vs_stpq_post(vs2, result);
 7408 
 7409     __ sub(len, len, 128);
 7410     __ cmp(len, (u1)128);
 7411     __ br(Assembler::GE, L_loop);
 7412 
 7413     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7414     __ mov(r0, zr); // return 0
 7415     __ ret(lr);
 7416 
 7417     // record the stub entry and end
 7418     store_archive_data(stub_id, start, __ pc());
 7419 
 7420     return start;
 7421   }
 7422 
  // Dilithium Montgomery multiply an array by a constant.
  // A straightforward implementation of the method
  // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
 7427   //
 7428   // coeffs (int[256]) = c_rarg0
 7429   // constant (int) = c_rarg1
 7430   address generate_dilithiumMontMulByConstant() {
 7431     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 7432     int entry_count = StubInfo::entry_count(stub_id);
 7433     assert(entry_count == 1, "sanity check");
 7434     address start = load_archive_data(stub_id);
 7435     if (start != nullptr) {
 7436       return start;
 7437     }
 7438     __ align(CodeEntryAlignment);
 7439     StubCodeMark mark(this, stub_id);
 7440     start = __ pc();
 7441     __ enter();
 7442 
 7443     Label L_loop;
 7444 
 7445     const Register coeffs = c_rarg0;
 7446     const Register constant = c_rarg1;
 7447 
 7448     const Register dilithiumConsts = r10;
 7449     const Register result = r11;
 7450     const Register len = r12;
 7451 
 7452     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7453     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7454     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7455     VSeq<8> vconst(29, 0);             // for montmul by constant
 7456 
 7457     // results track inputs
 7458     __ add(result, coeffs, 0);
 7459     __ lea(dilithiumConsts,
 7460              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7461 
 7462     // load constants q, qinv -- they do not get clobbered by first two loops
 7463     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7464     // copy caller supplied constant across vconst
 7465     __ dup(vconst[0], __ T4S, constant);
 7466     __ mov(len, zr);
 7467     __ add(len, len, 1024);
 7468 
 7469     __ BIND(L_loop);
 7470 
 7471     // load next 32 inputs
 7472     vs_ldpq_post(vs2, coeffs);
 7473     // mont mul by constant
 7474     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 7475     // write next 32 results
 7476     vs_stpq_post(vs2, result);
 7477 
 7478     __ sub(len, len, 128);
 7479     __ cmp(len, (u1)128);
 7480     __ br(Assembler::GE, L_loop);
 7481 
 7482     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7483     __ mov(r0, zr); // return 0
 7484     __ ret(lr);
 7485 
 7486     // record the stub entry and end
 7487     store_archive_data(stub_id, start, __ pc());
 7488 
 7489     return start;
 7490   }
 7491 
 7492   // Dilithium decompose poly.
  // Implements the method
  // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
  //         int[] highPart, int twoGamma2, int multiplier) {}
  // of the sun.security.provider.ML_DSA class
 7496   //
 7497   // input (int[256]) = c_rarg0
 7498   // lowPart (int[256]) = c_rarg1
 7499   // highPart (int[256]) = c_rarg2
 7500   // twoGamma2  (int) = c_rarg3
 7501   // multiplier (int) = c_rarg4
 7502   address generate_dilithiumDecomposePoly() {
 7503     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 7504     int entry_count = StubInfo::entry_count(stub_id);
 7505     assert(entry_count == 1, "sanity check");
 7506     address start = load_archive_data(stub_id);
 7507     if (start != nullptr) {
 7508       return start;
 7509     }
 7510     __ align(CodeEntryAlignment);
 7511     StubCodeMark mark(this, stub_id);
 7512     start = __ pc();
 7513     Label L_loop;
 7514 
 7515     const Register input = c_rarg0;
 7516     const Register lowPart = c_rarg1;
 7517     const Register highPart = c_rarg2;
 7518     const Register twoGamma2 = c_rarg3;
 7519     const Register multiplier = c_rarg4;
 7520 
 7521     const Register len = r9;
 7522     const Register dilithiumConsts = r10;
 7523     const Register tmp = r11;
 7524 
 7525     // 6 independent sets of 4x4s values
 7526     VSeq<4> vs1(0), vs2(4), vs3(8);
 7527     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7528 
 7529     // 7 constants for cross-multiplying
 7530     VSeq<4> one(25, 0);
 7531     VSeq<4> qminus1(26, 0);
 7532     VSeq<4> g2(27, 0);
 7533     VSeq<4> twog2(28, 0);
 7534     VSeq<4> mult(29, 0);
 7535     VSeq<4> q(30, 0);
 7536     VSeq<4> qadd(31, 0);
 7537 
 7538     __ enter();
 7539 
 7540     __ lea(dilithiumConsts,
 7541              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7542 
 7543     // save callee-saved registers
 7544     __ stpd(v8, v9, __ pre(sp, -64));
 7545     __ stpd(v10, v11, Address(sp, 16));
 7546     __ stpd(v12, v13, Address(sp, 32));
 7547     __ stpd(v14, v15, Address(sp, 48));
 7548 
 7549     // populate constant registers
 7550     __ mov(tmp, zr);
 7551     __ add(tmp, tmp, 1);
 7552     __ dup(one[0], __ T4S, tmp); // 1
 7553     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7554     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7555     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
    __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma2 reduce
 7557     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7558     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7559 
 7560     __ mov(len, zr);
 7561     __ add(len, len, 1024);
 7562 
 7563     __ BIND(L_loop);
 7564 
 7565     // load next 4x4S inputs interleaved: rplus --> vs1
 7566     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7567 
 7568     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7569     vs_addv(vtmp, __ T4S, vs1, qadd);
 7570     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7571     vs_mulv(vtmp, __ T4S, vtmp, q);
 7572     vs_subv(vs1, __ T4S, vs1, vtmp);
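    // n.b. this is a division-free mod q reduction: dilithium_q =
    // 8380417 sits just below 2^23, so (rplus + qadd) >> 23
    // approximates rplus / q, assuming the qadd constant is chosen so
    // that the quotient is exact over the valid input range.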
 7573 
 7574     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7575     vs_sshr(vtmp, __ T4S, vs1, 31);
 7576     vs_andr(vtmp, vtmp, q);
 7577     vs_addv(vs1, __ T4S, vs1, vtmp);
 7578 
 7579     // quotient --> vs2
 7580     // int quotient = (rplus * multiplier) >> 22;
 7581     vs_mulv(vtmp, __ T4S, vs1, mult);
 7582     vs_sshr(vs2, __ T4S, vtmp, 22);
 7583 
 7584     // r0 --> vs3
 7585     // int r0 = rplus - quotient * twoGamma2;
 7586     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7587     vs_subv(vs3, __ T4S, vs1, vtmp);
 7588 
 7589     // mask --> vs4
 7590     // int mask = (twoGamma2 - r0) >> 22;
 7591     vs_subv(vtmp, __ T4S, twog2, vs3);
 7592     vs_sshr(vs4, __ T4S, vtmp, 22);
 7593 
 7594     // r0 -= (mask & twoGamma2);
 7595     vs_andr(vtmp, vs4, twog2);
 7596     vs_subv(vs3, __ T4S, vs3, vtmp);
 7597 
 7598     //  quotient += (mask & 1);
 7599     vs_andr(vtmp, vs4, one);
 7600     vs_addv(vs2, __ T4S, vs2, vtmp);
 7601 
 7602     // mask = (twoGamma2 / 2 - r0) >> 31;
 7603     vs_subv(vtmp, __ T4S, g2, vs3);
 7604     vs_sshr(vs4, __ T4S, vtmp, 31);
 7605 
 7606     // r0 -= (mask & twoGamma2);
 7607     vs_andr(vtmp, vs4, twog2);
 7608     vs_subv(vs3, __ T4S, vs3, vtmp);
 7609 
 7610     // quotient += (mask & 1);
 7611     vs_andr(vtmp, vs4, one);
 7612     vs_addv(vs2, __ T4S, vs2, vtmp);
 7613 
 7614     // r1 --> vs5
 7615     // int r1 = rplus - r0 - (dilithium_q - 1);
 7616     vs_subv(vtmp, __ T4S, vs1, vs3);
 7617     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7618 
 7619     // r1 --> vs1 (overwriting rplus)
 7620     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7621     vs_negr(vtmp, __ T4S, vs5);
 7622     vs_orr(vtmp, vs5, vtmp);
 7623     vs_sshr(vs1, __ T4S, vtmp, 31);
 7624 
 7625     // r0 += ~r1;
 7626     vs_notr(vtmp, vs1);
 7627     vs_addv(vs3, __ T4S, vs3, vtmp);
 7628 
 7629     // r1 = r1 & quotient;
 7630     vs_andr(vs1, vs2, vs1);
 7631 
    // store results interleaved
 7633     // lowPart[m] = r0;
 7634     // highPart[m] = r1;
 7635     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7636     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7637 
 7638     __ sub(len, len, 64);
 7639     __ cmp(len, (u1)64);
 7640     __ br(Assembler::GE, L_loop);
 7641 
 7642     // restore callee-saved vector registers
 7643     __ ldpd(v14, v15, Address(sp, 48));
 7644     __ ldpd(v12, v13, Address(sp, 32));
 7645     __ ldpd(v10, v11, Address(sp, 16));
 7646     __ ldpd(v8, v9, __ post(sp, 64));
 7647 
 7648     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7649     __ mov(r0, zr); // return 0
 7650     __ ret(lr);
 7651 
 7652     // record the stub entry and end
 7653     store_archive_data(stub_id, start, __ pc());
 7654 
 7655     return start;
 7656   }
 7657 
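  // Keccak chi step on one row of 5 lanes, in place:
  //   a_i ^= (~a_{i+1}) & a_{i+2}  (indices mod 5)
  // bic computes the and-not terms into scratch registers so that each
  // input is consumed before it is overwritten.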
 7658   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7659              Register tmp0, Register tmp1, Register tmp2) {
 7660     __ bic(tmp0, a2, a1); // for a0
 7661     __ bic(tmp1, a3, a2); // for a1
 7662     __ bic(tmp2, a4, a3); // for a2
 7663     __ eor(a2, a2, tmp2);
 7664     __ bic(tmp2, a0, a4); // for a3
 7665     __ eor(a3, a3, tmp2);
 7666     __ bic(tmp2, a1, a0); // for a4
 7667     __ eor(a0, a0, tmp0);
 7668     __ eor(a1, a1, tmp1);
 7669     __ eor(a4, a4, tmp2);
 7670   }
 7671 
 7672   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7673                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7674                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7675                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7676                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7677                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7678                         Register tmp0, Register tmp1, Register tmp2) {
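    // theta: compute the five column parities c0..c4, derive
    // d_i = c_{i-1} ^ rol(c_{i+1}, 1) using eor3/rax1, then fold each
    // d_i into the five lanes of its column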
 7679     __ eor3(tmp1, a4, a9, a14);
 7680     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7681     __ eor3(tmp2, a1, a6, a11);
 7682     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7683     __ rax1(tmp2, tmp0, tmp1); // d0
 7684     {
 7685 
 7686       Register tmp3, tmp4;
 7687       if (can_use_fp && can_use_r18) {
 7688         tmp3 = rfp;
 7689         tmp4 = r18_tls;
 7690       } else {
 7691         tmp3 = a4;
 7692         tmp4 = a9;
 7693         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7694       }
 7695 
 7696       __ eor3(tmp3, a0, a5, a10);
 7697       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7698       __ eor(a0, a0, tmp2);
 7699       __ eor(a5, a5, tmp2);
 7700       __ eor(a10, a10, tmp2);
 7701       __ eor(a15, a15, tmp2);
 7702       __ eor(a20, a20, tmp2); // d0(tmp2)
 7703       __ eor3(tmp3, a2, a7, a12);
 7704       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7705       __ rax1(tmp3, tmp4, tmp2); // d1
 7706       __ eor(a1, a1, tmp3);
 7707       __ eor(a6, a6, tmp3);
 7708       __ eor(a11, a11, tmp3);
 7709       __ eor(a16, a16, tmp3);
 7710       __ eor(a21, a21, tmp3); // d1(tmp3)
 7711       __ rax1(tmp3, tmp2, tmp0); // d3
 7712       __ eor3(tmp2, a3, a8, a13);
 7713       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7714       __ eor(a3, a3, tmp3);
 7715       __ eor(a8, a8, tmp3);
 7716       __ eor(a13, a13, tmp3);
 7717       __ eor(a18, a18, tmp3);
 7718       __ eor(a23, a23, tmp3);
 7719       __ rax1(tmp2, tmp1, tmp0); // d2
 7720       __ eor(a2, a2, tmp2);
 7721       __ eor(a7, a7, tmp2);
 7722       __ eor(a12, a12, tmp2);
 7723       __ rax1(tmp0, tmp0, tmp4); // d4
 7724       if (!can_use_fp || !can_use_r18) {
 7725         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7726       }
 7727       __ eor(a17, a17, tmp2);
 7728       __ eor(a22, a22, tmp2);
 7729       __ eor(a4, a4, tmp0);
 7730       __ eor(a9, a9, tmp0);
 7731       __ eor(a14, a14, tmp0);
 7732       __ eor(a19, a19, tmp0);
 7733       __ eor(a24, a24, tmp0);
 7734     }
 7735 
 7736     __ rol(tmp0, a10, 3);
 7737     __ rol(a10, a1, 1);
 7738     __ rol(a1, a6, 44);
 7739     __ rol(a6, a9, 20);
 7740     __ rol(a9, a22, 61);
 7741     __ rol(a22, a14, 39);
 7742     __ rol(a14, a20, 18);
 7743     __ rol(a20, a2, 62);
 7744     __ rol(a2, a12, 43);
 7745     __ rol(a12, a13, 25);
    __ rol(a13, a19, 8);
 7747     __ rol(a19, a23, 56);
 7748     __ rol(a23, a15, 41);
 7749     __ rol(a15, a4, 27);
 7750     __ rol(a4, a24, 14);
 7751     __ rol(a24, a21, 2);
 7752     __ rol(a21, a8, 55);
 7753     __ rol(a8, a16, 45);
 7754     __ rol(a16, a5, 36);
 7755     __ rol(a5, a3, 28);
 7756     __ rol(a3, a18, 21);
 7757     __ rol(a18, a17, 15);
 7758     __ rol(a17, a11, 10);
 7759     __ rol(a11, a7, 6);
 7760     __ mov(a7, tmp0);
 7761 
 7762     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7763     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7764     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7765     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7766     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7767 
 7768     __ ldr(tmp1, __ post(rc, 8));
 7769     __ eor(a0, a0, tmp1);
 7770 
 7771   }
 7772 
 7773   // Arguments:
 7774   //
 7775   // Inputs:
 7776   //   c_rarg0   - byte[]  source+offset
 7777   //   c_rarg1   - byte[]  SHA.state
 7778   //   c_rarg2   - int     block_size
 7779   //   c_rarg3   - int     offset
 7780   //   c_rarg4   - int     limit
 7781   //
 7782   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7783     bool multi_block;
 7784     switch (stub_id) {
 7785     case StubId::stubgen_sha3_implCompress_id:
 7786       multi_block = false;
 7787       break;
 7788     case StubId::stubgen_sha3_implCompressMB_id:
 7789       multi_block = true;
 7790       break;
 7791     default:
 7792       ShouldNotReachHere();
 7793     }
 7794     int entry_count = StubInfo::entry_count(stub_id);
 7795     assert(entry_count == 1, "sanity check");
 7796     address start = load_archive_data(stub_id);
 7797     if (start != nullptr) {
 7798       return start;
 7799     }
 7800     __ align(CodeEntryAlignment);
 7801     StubCodeMark mark(this, stub_id);
 7802     start = __ pc();
 7803 
 7804     Register buf           = c_rarg0;
 7805     Register state         = c_rarg1;
 7806     Register block_size    = c_rarg2;
 7807     Register ofs           = c_rarg3;
 7808     Register limit         = c_rarg4;
 7809 
    // use r3..r17, r19..r28 to keep a0..a24.
 7811     // a0..a24 are respective locals from SHA3.java
 7812     Register a0 = r25,
 7813              a1 = r26,
 7814              a2 = r27,
 7815              a3 = r3,
 7816              a4 = r4,
 7817              a5 = r5,
 7818              a6 = r6,
 7819              a7 = r7,
 7820              a8 = rscratch1, // r8
 7821              a9 = rscratch2, // r9
 7822              a10 = r10,
 7823              a11 = r11,
 7824              a12 = r12,
 7825              a13 = r13,
 7826              a14 = r14,
 7827              a15 = r15,
 7828              a16 = r16,
 7829              a17 = r17,
 7830              a18 = r28,
 7831              a19 = r19,
 7832              a20 = r20,
 7833              a21 = r21,
 7834              a22 = r22,
 7835              a23 = r23,
 7836              a24 = r24;
 7837 
 7838     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7839 
 7840     Label sha3_loop, rounds24_preloop, loop_body;
 7841     Label sha3_512_or_sha3_384, shake128;
 7842 
 7843     bool can_use_r18 = false;
 7844 #ifndef R18_RESERVED
 7845     can_use_r18 = true;
 7846 #endif
 7847     bool can_use_fp = !PreserveFramePointer;
 7848 
 7849     __ enter();
 7850 
    // save almost all of the as yet unsaved GPRs on the stack
 7852     __ str(block_size, __ pre(sp, -128));
 7853     if (multi_block) {
 7854       __ stpw(ofs, limit, Address(sp, 8));
 7855     }
 7856     // 8 bytes at sp+16 will be used to keep buf
 7857     __ stp(r19, r20, Address(sp, 32));
 7858     __ stp(r21, r22, Address(sp, 48));
 7859     __ stp(r23, r24, Address(sp, 64));
 7860     __ stp(r25, r26, Address(sp, 80));
 7861     __ stp(r27, r28, Address(sp, 96));
 7862     if (can_use_r18 && can_use_fp) {
 7863       __ stp(r18_tls, state, Address(sp, 112));
 7864     } else {
 7865       __ str(state, Address(sp, 112));
 7866     }
 7867 
    // begin sha3 calculations: loading a0..a24 from state array
 7869     __ ldp(a0, a1, state);
 7870     __ ldp(a2, a3, Address(state, 16));
 7871     __ ldp(a4, a5, Address(state, 32));
 7872     __ ldp(a6, a7, Address(state, 48));
 7873     __ ldp(a8, a9, Address(state, 64));
 7874     __ ldp(a10, a11, Address(state, 80));
 7875     __ ldp(a12, a13, Address(state, 96));
 7876     __ ldp(a14, a15, Address(state, 112));
 7877     __ ldp(a16, a17, Address(state, 128));
 7878     __ ldp(a18, a19, Address(state, 144));
 7879     __ ldp(a20, a21, Address(state, 160));
 7880     __ ldp(a22, a23, Address(state, 176));
 7881     __ ldr(a24, Address(state, 192));
 7882 
 7883     __ BIND(sha3_loop);
 7884 
 7885     // load input
 7886     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7887     __ eor(a0, a0, tmp3);
 7888     __ eor(a1, a1, tmp2);
 7889     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7890     __ eor(a2, a2, tmp3);
 7891     __ eor(a3, a3, tmp2);
 7892     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7893     __ eor(a4, a4, tmp3);
 7894     __ eor(a5, a5, tmp2);
 7895     __ ldr(tmp3, __ post(buf, 8));
 7896     __ eor(a6, a6, tmp3);
 7897 
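    // from here the absorb path depends on the rate (block_size, in
    // bytes): 72 = SHA3-512, 104 = SHA3-384, 136 = SHA3-256/SHAKE256,
    // 144 = SHA3-224, 168 = SHAKE128. Bit 7 of block_size is clear
    // only for 72 and 104.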
 7898     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7899     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7900 
 7901     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7902     __ eor(a7, a7, tmp3);
 7903     __ eor(a8, a8, tmp2);
 7904     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7905     __ eor(a9, a9, tmp3);
 7906     __ eor(a10, a10, tmp2);
 7907     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7908     __ eor(a11, a11, tmp3);
 7909     __ eor(a12, a12, tmp2);
 7910     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7911     __ eor(a13, a13, tmp3);
 7912     __ eor(a14, a14, tmp2);
 7913     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7914     __ eor(a15, a15, tmp3);
 7915     __ eor(a16, a16, tmp2);
 7916 
 7917     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7918     __ andw(tmp2, block_size, 48);
 7919     __ cbzw(tmp2, rounds24_preloop);
 7920     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
 7922     __ ldr(tmp3, __ post(buf, 8));
 7923     __ eor(a17, a17, tmp3);
 7924     __ b(rounds24_preloop);
 7925 
 7926     __ BIND(shake128);
 7927     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7928     __ eor(a17, a17, tmp3);
 7929     __ eor(a18, a18, tmp2);
 7930     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7931     __ eor(a19, a19, tmp3);
 7932     __ eor(a20, a20, tmp2);
 7933     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7934 
 7935     __ BIND(sha3_512_or_sha3_384);
 7936     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7937     __ eor(a7, a7, tmp3);
 7938     __ eor(a8, a8, tmp2);
 7939     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7940 
 7941     // SHA3-384
 7942     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7943     __ eor(a9, a9, tmp3);
 7944     __ eor(a10, a10, tmp2);
 7945     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7946     __ eor(a11, a11, tmp3);
 7947     __ eor(a12, a12, tmp2);
 7948 
 7949     __ BIND(rounds24_preloop);
 7950     __ fmovs(v0, 24.0); // float loop counter,
 7951     __ fmovs(v1, 1.0);  // exact representation
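    // Keeping the round counter in an FP register frees up a GPR for the
    // state. 24.0 down to 0.0 in steps of 1.0 are all exactly representable
    // as floats, so the fsubs/fcmps pair below counts the 24 rounds exactly.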
 7952 
 7953     __ str(buf, Address(sp, 16));
 7954     __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));
 7955 
 7956     __ BIND(loop_body);
 7957     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7958                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7959                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7960                      tmp0, tmp1, tmp2);
 7961     __ fsubs(v0, v0, v1);
 7962     __ fcmps(v0, 0.0);
 7963     __ br(__ NE, loop_body);
 7964 
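    // For multi-block digests: advance the offset by one block and keep
    // absorbing while the new offset is still within the limit; otherwise
    // fall through and return the new offset in c_rarg0.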
 7965     if (multi_block) {
 7966       __ ldrw(block_size, sp); // block_size
 7967       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7968       __ addw(tmp2, tmp2, block_size);
 7969       __ cmpw(tmp2, tmp1);
 7970       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7971       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7972       __ br(Assembler::LE, sha3_loop);
 7973       __ movw(c_rarg0, tmp2); // return offset
 7974     }
 7975     if (can_use_fp && can_use_r18) {
 7976       __ ldp(r18_tls, state, Address(sp, 112));
 7977     } else {
 7978       __ ldr(state, Address(sp, 112));
 7979     }
 7980     // save calculated sha3 state
 7981     __ stp(a0, a1, Address(state));
 7982     __ stp(a2, a3, Address(state, 16));
 7983     __ stp(a4, a5, Address(state, 32));
 7984     __ stp(a6, a7, Address(state, 48));
 7985     __ stp(a8, a9, Address(state, 64));
 7986     __ stp(a10, a11, Address(state, 80));
 7987     __ stp(a12, a13, Address(state, 96));
 7988     __ stp(a14, a15, Address(state, 112));
 7989     __ stp(a16, a17, Address(state, 128));
 7990     __ stp(a18, a19, Address(state, 144));
 7991     __ stp(a20, a21, Address(state, 160));
 7992     __ stp(a22, a23, Address(state, 176));
 7993     __ str(a24, Address(state, 192));
 7994 
 7995     // restore required registers from stack
 7996     __ ldp(r19, r20, Address(sp, 32));
 7997     __ ldp(r21, r22, Address(sp, 48));
 7998     __ ldp(r23, r24, Address(sp, 64));
 7999     __ ldp(r25, r26, Address(sp, 80));
 8000     __ ldp(r27, r28, Address(sp, 96));
 8001     if (can_use_fp && can_use_r18) {
 8002       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 8003     } // else no need to recalculate rfp, since it wasn't changed
 8004 
 8005     __ leave();
 8006 
 8007     __ ret(lr);
 8008 
 8009     // record the stub entry and end
 8010     store_archive_data(stub_id, start, __ pc());
 8011 
 8012     return start;
 8013   }
 8014 
 8015   /**
 8016    *  Arguments:
 8017    *
 8018    * Inputs:
 8019    *   c_rarg0   - int crc
 8020    *   c_rarg1   - byte* buf
 8021    *   c_rarg2   - int length
 8022    *
 8023    * Output:
   *       r0   - int crc result
 8025    */
 8026   address generate_updateBytesCRC32() {
 8027     assert(UseCRC32Intrinsics, "what are we doing here?");
 8028     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 8029     int entry_count = StubInfo::entry_count(stub_id);
 8030     assert(entry_count == 1, "sanity check");
 8031     address start = load_archive_data(stub_id);
 8032     if (start != nullptr) {
 8033       return start;
 8034     }
 8035     __ align(CodeEntryAlignment);
 8036     StubCodeMark mark(this, stub_id);
 8037 
 8038     start = __ pc();
 8039 
 8040     const Register crc   = c_rarg0;  // crc
 8041     const Register buf   = c_rarg1;  // source java byte array address
 8042     const Register len   = c_rarg2;  // length
 8043     const Register table0 = c_rarg3; // crc_table address
 8044     const Register table1 = c_rarg4;
 8045     const Register table2 = c_rarg5;
 8046     const Register table3 = c_rarg6;
 8047     const Register tmp3 = c_rarg7;
 8048 
 8049     BLOCK_COMMENT("Entry:");
 8050     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8051 
 8052     __ kernel_crc32(crc, buf, len,
 8053               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 8054 
 8055     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8056     __ ret(lr);
 8057 
 8058     // record the stub entry and end
 8059     store_archive_data(stub_id, start, __ pc());
 8060 
 8061     return start;
 8062   }
 8063 
 8064   /**
 8065    *  Arguments:
 8066    *
 8067    * Inputs:
 8068    *   c_rarg0   - int crc
 8069    *   c_rarg1   - byte* buf
 8070    *   c_rarg2   - int length
 8071    *   c_rarg3   - int* table
 8072    *
 8073    * Output:
 8074    *       r0   - int crc result
 8075    */
 8076   address generate_updateBytesCRC32C() {
 8077     assert(UseCRC32CIntrinsics, "what are we doing here?");
 8078     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 8079     int entry_count = StubInfo::entry_count(stub_id);
 8080     assert(entry_count == 1, "sanity check");
 8081     address start = load_archive_data(stub_id);
 8082     if (start != nullptr) {
 8083       return start;
 8084     }
 8085     __ align(CodeEntryAlignment);
 8086     StubCodeMark mark(this, stub_id);
 8087 
 8088     start = __ pc();
 8089 
 8090     const Register crc   = c_rarg0;  // crc
 8091     const Register buf   = c_rarg1;  // source java byte array address
 8092     const Register len   = c_rarg2;  // length
 8093     const Register table0 = c_rarg3; // crc_table address
 8094     const Register table1 = c_rarg4;
 8095     const Register table2 = c_rarg5;
 8096     const Register table3 = c_rarg6;
 8097     const Register tmp3 = c_rarg7;
 8098 
 8099     BLOCK_COMMENT("Entry:");
 8100     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8101 
 8102     __ kernel_crc32c(crc, buf, len,
 8103               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 8104 
 8105     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8106     __ ret(lr);
 8107 
 8108     // record the stub entry and end
 8109     store_archive_data(stub_id, start, __ pc());
 8110 
 8111     return start;
 8112   }
 8113 
 8114   /***
 8115    *  Arguments:
 8116    *
 8117    *  Inputs:
 8118    *   c_rarg0   - int   adler
 8119    *   c_rarg1   - byte* buff
 8120    *   c_rarg2   - int   len
 8121    *
 8122    * Output:
 8123    *   c_rarg0   - int adler result
 8124    */
 8125   address generate_updateBytesAdler32() {
 8126     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 8127     int entry_count = StubInfo::entry_count(stub_id);
 8128     assert(entry_count == 1, "sanity check");
 8129     address start = load_archive_data(stub_id);
 8130     if (start != nullptr) {
 8131       return start;
 8132     }
 8133     __ align(CodeEntryAlignment);
 8134     StubCodeMark mark(this, stub_id);
 8135     start = __ pc();
 8136 
 8137     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 8138 
 8139     // Aliases
 8140     Register adler  = c_rarg0;
 8141     Register s1     = c_rarg0;
 8142     Register s2     = c_rarg3;
 8143     Register buff   = c_rarg1;
 8144     Register len    = c_rarg2;
 8145     Register nmax  = r4;
 8146     Register base  = r5;
 8147     Register count = r6;
 8148     Register temp0 = rscratch1;
 8149     Register temp1 = rscratch2;
 8150     FloatRegister vbytes = v0;
 8151     FloatRegister vs1acc = v1;
 8152     FloatRegister vs2acc = v2;
 8153     FloatRegister vtable = v3;
 8154 
 8155     // Max number of bytes we can process before having to take the mod
 8156     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 8157     uint64_t BASE = 0xfff1;
 8158     uint64_t NMAX = 0x15B0;
 8159 
 8160     __ mov(base, BASE);
 8161     __ mov(nmax, NMAX);
 8162 
    // Load the accumulation coefficients <16, 15, ..., 1> used for the s2 dot product
 8164     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 8165     __ ld1(vtable, __ T16B, Address(temp0));
 8166 
 8167     // s1 is initialized to the lower 16 bits of adler
 8168     // s2 is initialized to the upper 16 bits of adler
 8169     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 8170     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 8171 
    // The pipelined loop needs at least 16 elements per iteration.
    // It checks this itself, but it is cheaper to skip straight to the cleanup loop here.
 8174     __ cmp(len, (u1)16);
 8175     __ br(Assembler::HS, L_nmax);
 8176     __ cbz(len, L_combine);
 8177 
 8178     __ bind(L_simple_by1_loop);
 8179     __ ldrb(temp0, Address(__ post(buff, 1)));
 8180     __ add(s1, s1, temp0);
 8181     __ add(s2, s2, s1);
 8182     __ subs(len, len, 1);
 8183     __ br(Assembler::HI, L_simple_by1_loop);
 8184 
 8185     // s1 = s1 % BASE
 8186     __ subs(temp0, s1, base);
 8187     __ csel(s1, temp0, s1, Assembler::HS);
 8188 
 8189     // s2 = s2 % BASE
 8190     __ lsr(temp0, s2, 16);
 8191     __ lsl(temp1, temp0, 4);
 8192     __ sub(temp1, temp1, temp0);
 8193     __ add(s2, temp1, s2, ext::uxth);
 8194 
 8195     __ subs(temp0, s2, base);
 8196     __ csel(s2, temp0, s2, Assembler::HS);
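    // The sequences above reduce s1 and s2 modulo BASE without a division.
    // s1 can exceed BASE only slightly here, so one conditional subtract
    // suffices. s2 first has its upper 16 bits folded down: since
    // 2^16 mod 65521 == 15, we have
    //   s == (s >> 16) * 15 + (s & 0xffff)  (mod BASE)
    // after which one conditional subtract brings the result below BASE.
    // A scalar C sketch of one fold step (illustrative only):
    //   uint32_t fold16(uint32_t s) { return (s >> 16) * 15 + (s & 0xffff); }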
 8197 
 8198     __ b(L_combine);
 8199 
 8200     __ bind(L_nmax);
 8201     __ subs(len, len, nmax);
 8202     __ sub(count, nmax, 16);
 8203     __ br(Assembler::LO, L_by16);
 8204 
 8205     __ bind(L_nmax_loop);
 8206 
 8207     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 8208                                       vbytes, vs1acc, vs2acc, vtable);
 8209 
 8210     __ subs(count, count, 16);
 8211     __ br(Assembler::HS, L_nmax_loop);
 8212 
 8213     // s1 = s1 % BASE
 8214     __ lsr(temp0, s1, 16);
 8215     __ lsl(temp1, temp0, 4);
 8216     __ sub(temp1, temp1, temp0);
 8217     __ add(temp1, temp1, s1, ext::uxth);
 8218 
 8219     __ lsr(temp0, temp1, 16);
 8220     __ lsl(s1, temp0, 4);
 8221     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 8223 
 8224     __ subs(temp0, s1, base);
 8225     __ csel(s1, temp0, s1, Assembler::HS);
 8226 
 8227     // s2 = s2 % BASE
 8228     __ lsr(temp0, s2, 16);
 8229     __ lsl(temp1, temp0, 4);
 8230     __ sub(temp1, temp1, temp0);
 8231     __ add(temp1, temp1, s2, ext::uxth);
 8232 
 8233     __ lsr(temp0, temp1, 16);
 8234     __ lsl(s2, temp0, 4);
 8235     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 8237 
 8238     __ subs(temp0, s2, base);
 8239     __ csel(s2, temp0, s2, Assembler::HS);
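    // After accumulating up to NMAX bytes, s1 and s2 can approach 2^32, so
    // the 16-bit fold is applied twice: the first fold brings the value below
    // about 2^20, the second below 2 * BASE, and the final conditional
    // subtract completes the reduction.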
 8240 
 8241     __ subs(len, len, nmax);
 8242     __ sub(count, nmax, 16);
 8243     __ br(Assembler::HS, L_nmax_loop);
 8244 
 8245     __ bind(L_by16);
 8246     __ adds(len, len, count);
 8247     __ br(Assembler::LO, L_by1);
 8248 
 8249     __ bind(L_by16_loop);
 8250 
 8251     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 8252                                       vbytes, vs1acc, vs2acc, vtable);
 8253 
 8254     __ subs(len, len, 16);
 8255     __ br(Assembler::HS, L_by16_loop);
 8256 
 8257     __ bind(L_by1);
 8258     __ adds(len, len, 15);
 8259     __ br(Assembler::LO, L_do_mod);
 8260 
 8261     __ bind(L_by1_loop);
 8262     __ ldrb(temp0, Address(__ post(buff, 1)));
 8263     __ add(s1, temp0, s1);
 8264     __ add(s2, s2, s1);
 8265     __ subs(len, len, 1);
 8266     __ br(Assembler::HS, L_by1_loop);
 8267 
 8268     __ bind(L_do_mod);
 8269     // s1 = s1 % BASE
 8270     __ lsr(temp0, s1, 16);
 8271     __ lsl(temp1, temp0, 4);
 8272     __ sub(temp1, temp1, temp0);
 8273     __ add(temp1, temp1, s1, ext::uxth);
 8274 
 8275     __ lsr(temp0, temp1, 16);
 8276     __ lsl(s1, temp0, 4);
 8277     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 8279 
 8280     __ subs(temp0, s1, base);
 8281     __ csel(s1, temp0, s1, Assembler::HS);
 8282 
 8283     // s2 = s2 % BASE
 8284     __ lsr(temp0, s2, 16);
 8285     __ lsl(temp1, temp0, 4);
 8286     __ sub(temp1, temp1, temp0);
 8287     __ add(temp1, temp1, s2, ext::uxth);
 8288 
 8289     __ lsr(temp0, temp1, 16);
 8290     __ lsl(s2, temp0, 4);
 8291     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 8293 
 8294     __ subs(temp0, s2, base);
 8295     __ csel(s2, temp0, s2, Assembler::HS);
 8296 
 8297     // Combine lower bits and higher bits
 8298     __ bind(L_combine);
 8299     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 8300 
 8301     __ ret(lr);
 8302 
 8303     // record the stub entry and end
 8304     store_archive_data(stub_id, start, __ pc());
 8305 
 8306     return start;
 8307   }
 8308 
 8309   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 8310           Register temp0, Register temp1, FloatRegister vbytes,
 8311           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 8312     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 8313     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 8314     // In non-vectorized code, we update s1 and s2 as:
 8315     //   s1 <- s1 + b1
 8316     //   s2 <- s2 + s1
 8317     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
 8319     //   ...
 8320     //   s1 <- s1 + b16
 8321     //   s2 <- s2 + s1
 8322     // Putting above assignments together, we have:
 8323     //   s1_new = s1 + b1 + b2 + ... + b16
 8324     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 8325     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 8326     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 8327     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 8328 
 8329     // s2 = s2 + s1 * 16
 8330     __ add(s2, s2, s1, Assembler::LSL, 4);
 8331 
 8332     // vs1acc = b1 + b2 + b3 + ... + b16
 8333     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 8334     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 8335     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 8336     __ uaddlv(vs1acc, __ T16B, vbytes);
 8337     __ uaddlv(vs2acc, __ T8H, vs2acc);
 8338 
 8339     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 8340     __ fmovd(temp0, vs1acc);
 8341     __ fmovd(temp1, vs2acc);
 8342     __ add(s1, s1, temp0);
 8343     __ add(s2, s2, temp1);
 8344   }
 8345 
 8346   /**
 8347    *  Arguments:
 8348    *
 8349    *  Input:
 8350    *    c_rarg0   - x address
 8351    *    c_rarg1   - x length
 8352    *    c_rarg2   - y address
 8353    *    c_rarg3   - y length
 8354    *    c_rarg4   - z address
 8355    */
 8356   address generate_multiplyToLen() {
 8357     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 8358     int entry_count = StubInfo::entry_count(stub_id);
 8359     assert(entry_count == 1, "sanity check");
 8360     address start = load_archive_data(stub_id);
 8361     if (start != nullptr) {
 8362       return start;
 8363     }
 8364     __ align(CodeEntryAlignment);
 8365     StubCodeMark mark(this, stub_id);
 8366 
 8367     start = __ pc();
 8368     const Register x     = r0;
 8369     const Register xlen  = r1;
 8370     const Register y     = r2;
 8371     const Register ylen  = r3;
 8372     const Register z     = r4;
 8373 
 8374     const Register tmp0  = r5;
 8375     const Register tmp1  = r10;
 8376     const Register tmp2  = r11;
 8377     const Register tmp3  = r12;
 8378     const Register tmp4  = r13;
 8379     const Register tmp5  = r14;
 8380     const Register tmp6  = r15;
 8381     const Register tmp7  = r16;
 8382 
 8383     BLOCK_COMMENT("Entry:");
 8384     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8385     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 8386     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8387     __ ret(lr);
 8388 
 8389     // record the stub entry and end
 8390     store_archive_data(stub_id, start, __ pc());
 8391 
 8392     return start;
 8393   }
 8394 
 8395   address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127, implemented in Java code, runs
    // faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len shows slightly better results overall
 8399     StubId stub_id = StubId::stubgen_squareToLen_id;
 8400     int entry_count = StubInfo::entry_count(stub_id);
 8401     assert(entry_count == 1, "sanity check");
 8402     address start = load_archive_data(stub_id);
 8403     if (start != nullptr) {
 8404       return start;
 8405     }
 8406     __ align(CodeEntryAlignment);
 8407     StubCodeMark mark(this, stub_id);
 8408     start = __ pc();
 8409 
 8410     const Register x     = r0;
 8411     const Register xlen  = r1;
 8412     const Register z     = r2;
 8413     const Register y     = r4; // == x
 8414     const Register ylen  = r5; // == xlen
 8415 
 8416     const Register tmp0  = r3;
 8417     const Register tmp1  = r10;
 8418     const Register tmp2  = r11;
 8419     const Register tmp3  = r12;
 8420     const Register tmp4  = r13;
 8421     const Register tmp5  = r14;
 8422     const Register tmp6  = r15;
 8423     const Register tmp7  = r16;
 8424 
 8425     RegSet spilled_regs = RegSet::of(y, ylen);
 8426     BLOCK_COMMENT("Entry:");
 8427     __ enter();
 8428     __ push(spilled_regs, sp);
 8429     __ mov(y, x);
 8430     __ mov(ylen, xlen);
 8431     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 8432     __ pop(spilled_regs, sp);
 8433     __ leave();
 8434     __ ret(lr);
 8435 
 8436     // record the stub entry and end
 8437     store_archive_data(stub_id, start, __ pc());
 8438 
 8439     return start;
 8440   }
 8441 
 8442   address generate_mulAdd() {
 8443     StubId stub_id = StubId::stubgen_mulAdd_id;
 8444     int entry_count = StubInfo::entry_count(stub_id);
 8445     assert(entry_count == 1, "sanity check");
 8446     address start = load_archive_data(stub_id);
 8447     if (start != nullptr) {
 8448       return start;
 8449     }
 8450     __ align(CodeEntryAlignment);
 8451     StubCodeMark mark(this, stub_id);
 8452 
 8453     start = __ pc();
 8454 
 8455     const Register out     = r0;
 8456     const Register in      = r1;
 8457     const Register offset  = r2;
 8458     const Register len     = r3;
 8459     const Register k       = r4;
 8460 
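    // mul_add multiplies the len-word magnitude at in by the single word k and
    // accumulates the product into out, word by word, as in BigInteger.implMulAdd;
    // the resulting carry is returned in r0.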
 8461     BLOCK_COMMENT("Entry:");
 8462     __ enter();
 8463     __ mul_add(out, in, offset, len, k);
 8464     __ leave();
 8465     __ ret(lr);
 8466 
 8467     // record the stub entry and end
 8468     store_archive_data(stub_id, start, __ pc());
 8469 
 8470     return start;
 8471   }
 8472 
 8473   // Arguments:
 8474   //
 8475   // Input:
 8476   //   c_rarg0   - newArr address
 8477   //   c_rarg1   - oldArr address
 8478   //   c_rarg2   - newIdx
 8479   //   c_rarg3   - shiftCount
 8480   //   c_rarg4   - numIter
 8481   //
 8482   address generate_bigIntegerRightShift() {
 8483     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 8484     int entry_count = StubInfo::entry_count(stub_id);
 8485     assert(entry_count == 1, "sanity check");
 8486     address start = load_archive_data(stub_id);
 8487     if (start != nullptr) {
 8488       return start;
 8489     }
 8490     __ align(CodeEntryAlignment);
 8491     StubCodeMark mark(this, stub_id);
 8492     start = __ pc();
 8493 
 8494     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8495 
 8496     Register newArr        = c_rarg0;
 8497     Register oldArr        = c_rarg1;
 8498     Register newIdx        = c_rarg2;
 8499     Register shiftCount    = c_rarg3;
 8500     Register numIter       = c_rarg4;
 8501     Register idx           = numIter;
 8502 
 8503     Register newArrCur     = rscratch1;
 8504     Register shiftRevCount = rscratch2;
 8505     Register oldArrCur     = r13;
 8506     Register oldArrNext    = r14;
 8507 
 8508     FloatRegister oldElem0        = v0;
 8509     FloatRegister oldElem1        = v1;
 8510     FloatRegister newElem         = v2;
 8511     FloatRegister shiftVCount     = v3;
 8512     FloatRegister shiftVRevCount  = v4;
 8513 
 8514     __ cbz(idx, Exit);
 8515 
 8516     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8517 
 8518     // left shift count
 8519     __ movw(shiftRevCount, 32);
 8520     __ subw(shiftRevCount, shiftRevCount, shiftCount);
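    // Each output word combines two adjacent input words; roughly, in Java terms,
    //   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << shiftRevCount)
    // The SIMD loop below applies the same formula to four words at a time.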
 8521 
    // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
 8523     __ cmp(numIter, (u1)4);
 8524     __ br(Assembler::LT, ShiftThree);
 8525 
 8526     __ dup(shiftVCount,    __ T4S, shiftCount);
 8527     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 8528     __ negr(shiftVCount,   __ T4S, shiftVCount);
 8529 
 8530     __ BIND(ShiftSIMDLoop);
 8531 
 8532     // Calculate the load addresses
 8533     __ sub(idx, idx, 4);
 8534     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8535     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8536     __ add(oldArrCur,  oldArrNext, 4);
 8537 
 8538     // Load 4 words and process
 8539     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 8540     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 8541     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8542     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8543     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8544     __ st1(newElem,   __ T4S,  Address(newArrCur));
 8545 
 8546     __ cmp(idx, (u1)4);
 8547     __ br(Assembler::LT, ShiftTwoLoop);
 8548     __ b(ShiftSIMDLoop);
 8549 
 8550     __ BIND(ShiftTwoLoop);
 8551     __ cbz(idx, Exit);
 8552     __ cmp(idx, (u1)1);
 8553     __ br(Assembler::EQ, ShiftOne);
 8554 
 8555     // Calculate the load addresses
 8556     __ sub(idx, idx, 2);
 8557     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8558     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8559     __ add(oldArrCur,  oldArrNext, 4);
 8560 
 8561     // Load 2 words and process
 8562     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 8563     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 8564     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 8565     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 8566     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 8567     __ st1(newElem,   __ T2S, Address(newArrCur));
 8568     __ b(ShiftTwoLoop);
 8569 
 8570     __ BIND(ShiftThree);
 8571     __ tbz(idx, 1, ShiftOne);
 8572     __ tbz(idx, 0, ShiftTwo);
 8573     __ ldrw(r10,  Address(oldArr, 12));
 8574     __ ldrw(r11,  Address(oldArr, 8));
 8575     __ lsrvw(r10, r10, shiftCount);
 8576     __ lslvw(r11, r11, shiftRevCount);
 8577     __ orrw(r12,  r10, r11);
 8578     __ strw(r12,  Address(newArr, 8));
 8579 
 8580     __ BIND(ShiftTwo);
 8581     __ ldrw(r10,  Address(oldArr, 8));
 8582     __ ldrw(r11,  Address(oldArr, 4));
 8583     __ lsrvw(r10, r10, shiftCount);
 8584     __ lslvw(r11, r11, shiftRevCount);
 8585     __ orrw(r12,  r10, r11);
 8586     __ strw(r12,  Address(newArr, 4));
 8587 
 8588     __ BIND(ShiftOne);
 8589     __ ldrw(r10,  Address(oldArr, 4));
 8590     __ ldrw(r11,  Address(oldArr));
 8591     __ lsrvw(r10, r10, shiftCount);
 8592     __ lslvw(r11, r11, shiftRevCount);
 8593     __ orrw(r12,  r10, r11);
 8594     __ strw(r12,  Address(newArr));
 8595 
 8596     __ BIND(Exit);
 8597     __ ret(lr);
 8598 
 8599     // record the stub entry and end
 8600     store_archive_data(stub_id, start, __ pc());
 8601 
 8602     return start;
 8603   }
 8604 
 8605   // Arguments:
 8606   //
 8607   // Input:
 8608   //   c_rarg0   - newArr address
 8609   //   c_rarg1   - oldArr address
 8610   //   c_rarg2   - newIdx
 8611   //   c_rarg3   - shiftCount
 8612   //   c_rarg4   - numIter
 8613   //
 8614   address generate_bigIntegerLeftShift() {
 8615     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8616     int entry_count = StubInfo::entry_count(stub_id);
 8617     assert(entry_count == 1, "sanity check");
 8618     address start = load_archive_data(stub_id);
 8619     if (start != nullptr) {
 8620       return start;
 8621     }
 8622     __ align(CodeEntryAlignment);
 8623     StubCodeMark mark(this, stub_id);
 8624     start = __ pc();
 8625 
 8626     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8627 
 8628     Register newArr        = c_rarg0;
 8629     Register oldArr        = c_rarg1;
 8630     Register newIdx        = c_rarg2;
 8631     Register shiftCount    = c_rarg3;
 8632     Register numIter       = c_rarg4;
 8633 
 8634     Register shiftRevCount = rscratch1;
 8635     Register oldArrNext    = rscratch2;
 8636 
 8637     FloatRegister oldElem0        = v0;
 8638     FloatRegister oldElem1        = v1;
 8639     FloatRegister newElem         = v2;
 8640     FloatRegister shiftVCount     = v3;
 8641     FloatRegister shiftVRevCount  = v4;
 8642 
 8643     __ cbz(numIter, Exit);
 8644 
 8645     __ add(oldArrNext, oldArr, 4);
 8646     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8647 
 8648     // right shift count
 8649     __ movw(shiftRevCount, 32);
 8650     __ subw(shiftRevCount, shiftRevCount, shiftCount);
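    // Mirror image of the right-shift stub; roughly, in Java terms,
    //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> shiftRevCount)
    // computed four words at a time in the SIMD loop below.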
 8651 
    // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
 8653     __ cmp(numIter, (u1)4);
 8654     __ br(Assembler::LT, ShiftThree);
 8655 
 8656     __ dup(shiftVCount,     __ T4S, shiftCount);
 8657     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8658     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8659 
 8660     __ BIND(ShiftSIMDLoop);
 8661 
 8662     // load 4 words and process
 8663     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8664     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8665     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8666     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8667     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8668     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8669     __ sub(numIter,   numIter, 4);
 8670 
 8671     __ cmp(numIter, (u1)4);
 8672     __ br(Assembler::LT, ShiftTwoLoop);
 8673     __ b(ShiftSIMDLoop);
 8674 
 8675     __ BIND(ShiftTwoLoop);
 8676     __ cbz(numIter, Exit);
 8677     __ cmp(numIter, (u1)1);
 8678     __ br(Assembler::EQ, ShiftOne);
 8679 
 8680     // load 2 words and process
 8681     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8682     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8683     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8684     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8685     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8686     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8687     __ sub(numIter,   numIter, 2);
 8688     __ b(ShiftTwoLoop);
 8689 
 8690     __ BIND(ShiftThree);
 8691     __ ldrw(r10,  __ post(oldArr, 4));
 8692     __ ldrw(r11,  __ post(oldArrNext, 4));
 8693     __ lslvw(r10, r10, shiftCount);
 8694     __ lsrvw(r11, r11, shiftRevCount);
 8695     __ orrw(r12,  r10, r11);
 8696     __ strw(r12,  __ post(newArr, 4));
 8697     __ tbz(numIter, 1, Exit);
 8698     __ tbz(numIter, 0, ShiftOne);
 8699 
 8700     __ BIND(ShiftTwo);
 8701     __ ldrw(r10,  __ post(oldArr, 4));
 8702     __ ldrw(r11,  __ post(oldArrNext, 4));
 8703     __ lslvw(r10, r10, shiftCount);
 8704     __ lsrvw(r11, r11, shiftRevCount);
 8705     __ orrw(r12,  r10, r11);
 8706     __ strw(r12,  __ post(newArr, 4));
 8707 
 8708     __ BIND(ShiftOne);
 8709     __ ldrw(r10,  Address(oldArr));
 8710     __ ldrw(r11,  Address(oldArrNext));
 8711     __ lslvw(r10, r10, shiftCount);
 8712     __ lsrvw(r11, r11, shiftRevCount);
 8713     __ orrw(r12,  r10, r11);
 8714     __ strw(r12,  Address(newArr));
 8715 
 8716     __ BIND(Exit);
 8717     __ ret(lr);
 8718 
 8719     // record the stub entry and end
 8720     store_archive_data(stub_id, start, __ pc());
 8721 
 8722     return start;
 8723   }
 8724 
 8725   address generate_count_positives(address &count_positives_long) {
 8726     StubId stub_id = StubId::stubgen_count_positives_id;
 8727     GrowableArray<address> entries;
 8728     int entry_count = StubInfo::entry_count(stub_id);
 8729     // We have an extra entry for count_positives_long.
 8730     assert(entry_count == 2, "sanity check");
 8731     address start = load_archive_data(stub_id, &entries);
 8732     if (start != nullptr) {
 8733       assert(entries.length() == 1,
 8734              "unexpected extra entry count %d", entries.length());
 8735       count_positives_long = entries.at(0);
 8736       return start;
 8737     }
 8738     const u1 large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
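    // UPPER_BIT_MASK selects the sign bit of each of the 8 bytes in a 64-bit
    // word, so a tst against it sets NE iff at least one byte is negative.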
 8740     int dcache_line = VM_Version::dcache_line_size();
 8741 
 8742     Register ary1 = r1, len = r2, result = r0;
 8743 
 8744     __ align(CodeEntryAlignment);
 8745     StubCodeMark mark(this, stub_id);
 8746 
 8747     address entry = __ pc();
 8748 
 8749     __ enter();
 8750     // precondition: a copy of len is already in result
 8751     // __ mov(result, len);
 8752 
 8753   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8754         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8755 
 8756   __ cmp(len, (u1)15);
 8757   __ br(Assembler::GT, LEN_OVER_15);
  // Execution falls into this code only when the pointer is near the end of
  // a memory page and we have to avoid reading past it
 8760   __ add(ary1, ary1, len);
 8761   __ subs(len, len, 8);
 8762   __ br(Assembler::GT, LEN_OVER_8);
 8763   __ ldr(rscratch2, Address(ary1, -8));
 8764   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8765   __ lsrv(rscratch2, rscratch2, rscratch1);
 8766   __ tst(rscratch2, UPPER_BIT_MASK);
 8767   __ csel(result, zr, result, Assembler::NE);
 8768   __ leave();
 8769   __ ret(lr);
 8770   __ bind(LEN_OVER_8);
 8771   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dependency, so sub can execute while the load completes
 8773   __ tst(rscratch2, UPPER_BIT_MASK);
 8774   __ br(Assembler::NE, RET_NO_POP);
 8775   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8776   __ lsrv(rscratch1, rscratch1, rscratch2);
 8777   __ tst(rscratch1, UPPER_BIT_MASK);
 8778   __ bind(RET_NO_POP);
 8779   __ csel(result, zr, result, Assembler::NE);
 8780   __ leave();
 8781   __ ret(lr);
 8782 
 8783   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8784   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8785 
 8786   count_positives_long = __ pc(); // 2nd entry point
 8787   entries.append(count_positives_long);
 8788 
 8789   __ enter();
 8790 
 8791   __ bind(LEN_OVER_15);
 8792     __ push(spilled_regs, sp);
 8793     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8794     __ cbz(rscratch2, ALIGNED);
 8795     __ ldp(tmp6, tmp1, Address(ary1));
 8796     __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
 8798     __ add(ary1, ary1, rscratch1);
 8799     __ orr(tmp6, tmp6, tmp1);
 8800     __ tst(tmp6, UPPER_BIT_MASK);
 8801     __ br(Assembler::NE, RET_ADJUST);
 8802     __ sub(len, len, rscratch1);
 8803 
 8804   __ bind(ALIGNED);
 8805     __ cmp(len, large_loop_size);
 8806     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // case when an initially aligned large array has negative values in its
    // starting bytes, where LARGE_LOOP would otherwise do 4 reads instead of 1
    // (in the worst case), which is slower. Cases with negative bytes further
    // ahead won't be affected much; in fact, they will be faster due to the
    // early loads, fewer instructions and fewer branches in LARGE_LOOP.
 8813     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8814     __ sub(len, len, 16);
 8815     __ orr(tmp6, tmp6, tmp1);
 8816     __ tst(tmp6, UPPER_BIT_MASK);
 8817     __ br(Assembler::NE, RET_ADJUST_16);
 8818     __ cmp(len, large_loop_size);
 8819     __ br(Assembler::LT, CHECK_16);
 8820 
 8821     if (SoftwarePrefetchHintDistance >= 0
 8822         && SoftwarePrefetchHintDistance >= dcache_line) {
 8823       // initial prefetch
 8824       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8825     }
 8826   __ bind(LARGE_LOOP);
 8827     if (SoftwarePrefetchHintDistance >= 0) {
 8828       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8829     }
    // Issue the load instructions first, since this can save a few CPU/MEM cycles.
    // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp)
    // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
    // 3 instructions per iteration and has fewer branches; the downside is that this
    // disables the early return, so all 64 bytes are loaded and checked every time.
 8835     __ ldp(tmp2, tmp3, Address(ary1));
 8836     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8837     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8838     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8839     __ add(ary1, ary1, large_loop_size);
 8840     __ sub(len, len, large_loop_size);
 8841     __ orr(tmp2, tmp2, tmp3);
 8842     __ orr(tmp4, tmp4, tmp5);
 8843     __ orr(rscratch1, rscratch1, rscratch2);
 8844     __ orr(tmp6, tmp6, tmp1);
 8845     __ orr(tmp2, tmp2, tmp4);
 8846     __ orr(rscratch1, rscratch1, tmp6);
 8847     __ orr(tmp2, tmp2, rscratch1);
 8848     __ tst(tmp2, UPPER_BIT_MASK);
 8849     __ br(Assembler::NE, RET_ADJUST_LONG);
 8850     __ cmp(len, large_loop_size);
 8851     __ br(Assembler::GE, LARGE_LOOP);
 8852 
 8853   __ bind(CHECK_16); // small 16-byte load pre-loop
 8854     __ cmp(len, (u1)16);
 8855     __ br(Assembler::LT, POST_LOOP16);
 8856 
 8857   __ bind(LOOP16); // small 16-byte load loop
 8858     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8859     __ sub(len, len, 16);
 8860     __ orr(tmp2, tmp2, tmp3);
 8861     __ tst(tmp2, UPPER_BIT_MASK);
 8862     __ br(Assembler::NE, RET_ADJUST_16);
 8863     __ cmp(len, (u1)16);
 8864     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8865 
 8866   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8867     __ cmp(len, (u1)8);
 8868     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8869     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8870     __ tst(tmp3, UPPER_BIT_MASK);
 8871     __ br(Assembler::NE, RET_ADJUST);
 8872     __ sub(len, len, 8);
 8873 
 8874   __ bind(POST_LOOP16_LOAD_TAIL);
 8875     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8876     __ ldr(tmp1, Address(ary1));
 8877     __ mov(tmp2, 64);
 8878     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8879     __ lslv(tmp1, tmp1, tmp4);
 8880     __ tst(tmp1, UPPER_BIT_MASK);
 8881     __ br(Assembler::NE, RET_ADJUST);
 8882     // Fallthrough
 8883 
 8884   __ bind(RET_LEN);
 8885     __ pop(spilled_regs, sp);
 8886     __ leave();
 8887     __ ret(lr);
 8888 
    // The difference result - len is the count of bytes that are guaranteed
    // to be positive
 8891 
 8892   __ bind(RET_ADJUST_LONG);
 8893     __ add(len, len, (u1)(large_loop_size - 16));
 8894   __ bind(RET_ADJUST_16);
 8895     __ add(len, len, 16);
 8896   __ bind(RET_ADJUST);
 8897     __ pop(spilled_regs, sp);
 8898     __ leave();
 8899     __ sub(result, result, len);
 8900     __ ret(lr);
 8901 
 8902     // record the stub entry and end plus the extra entry
 8903     store_archive_data(stub_id, entry, __ pc(), &entries);
 8904 
 8905     return entry;
 8906   }
 8907 
 8908   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8909         bool usePrefetch, Label &NOT_EQUAL) {
 8910     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8911         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8912         tmp7 = r12, tmp8 = r13;
 8913     Label LOOP;
 8914 
 8915     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8916     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8917     __ bind(LOOP);
 8918     if (usePrefetch) {
 8919       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8920       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8921     }
 8922     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8923     __ eor(tmp1, tmp1, tmp2);
 8924     __ eor(tmp3, tmp3, tmp4);
 8925     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8926     __ orr(tmp1, tmp1, tmp3);
 8927     __ cbnz(tmp1, NOT_EQUAL);
 8928     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8929     __ eor(tmp5, tmp5, tmp6);
 8930     __ eor(tmp7, tmp7, tmp8);
 8931     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8932     __ orr(tmp5, tmp5, tmp7);
 8933     __ cbnz(tmp5, NOT_EQUAL);
 8934     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8935     __ eor(tmp1, tmp1, tmp2);
 8936     __ eor(tmp3, tmp3, tmp4);
 8937     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8938     __ orr(tmp1, tmp1, tmp3);
 8939     __ cbnz(tmp1, NOT_EQUAL);
 8940     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8941     __ eor(tmp5, tmp5, tmp6);
 8942     __ sub(cnt1, cnt1, 8 * wordSize);
 8943     __ eor(tmp7, tmp7, tmp8);
 8944     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
 8947     __ subs(tmp6, cnt1, loopThreshold);
 8948     __ orr(tmp5, tmp5, tmp7);
 8949     __ cbnz(tmp5, NOT_EQUAL);
 8950     __ br(__ GE, LOOP);
 8951     // post-loop
 8952     __ eor(tmp1, tmp1, tmp2);
 8953     __ eor(tmp3, tmp3, tmp4);
 8954     __ orr(tmp1, tmp1, tmp3);
 8955     __ sub(cnt1, cnt1, 2 * wordSize);
 8956     __ cbnz(tmp1, NOT_EQUAL);
 8957   }
 8958 
 8959   void generate_large_array_equals_loop_simd(int loopThreshold,
 8960         bool usePrefetch, Label &NOT_EQUAL) {
 8961     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8962         tmp2 = rscratch2;
 8963     Label LOOP;
 8964 
 8965     __ bind(LOOP);
 8966     if (usePrefetch) {
 8967       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8968       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8969     }
 8970     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8971     __ sub(cnt1, cnt1, 8 * wordSize);
 8972     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8973     __ subs(tmp1, cnt1, loopThreshold);
 8974     __ eor(v0, __ T16B, v0, v4);
 8975     __ eor(v1, __ T16B, v1, v5);
 8976     __ eor(v2, __ T16B, v2, v6);
 8977     __ eor(v3, __ T16B, v3, v7);
 8978     __ orr(v0, __ T16B, v0, v1);
 8979     __ orr(v1, __ T16B, v2, v3);
 8980     __ orr(v0, __ T16B, v0, v1);
 8981     __ umov(tmp1, v0, __ D, 0);
 8982     __ umov(tmp2, v0, __ D, 1);
 8983     __ orr(tmp1, tmp1, tmp2);
 8984     __ cbnz(tmp1, NOT_EQUAL);
 8985     __ br(__ GE, LOOP);
 8986   }
 8987 
 8988   // a1 = r1 - array1 address
 8989   // a2 = r2 - array2 address
 8990   // result = r0 - return value. Already contains "false"
 8991   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 8992   // r3-r5 are reserved temporary registers
 8993   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 8994   address generate_large_array_equals() {
 8995     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8996     int entry_count = StubInfo::entry_count(stub_id);
 8997     assert(entry_count == 1, "sanity check");
 8998     address start = load_archive_data(stub_id);
 8999     if (start != nullptr) {
 9000       return start;
 9001     }
 9002     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 9003         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 9004         tmp7 = r12, tmp8 = r13;
 9005     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 9006         SMALL_LOOP, POST_LOOP;
 9007     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // threshold chosen so that at least 32 of the prefetched bytes are actually used
 9009     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 9010     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 9011     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 9012     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 9013         tmp5, tmp6, tmp7, tmp8);
 9014 
 9015     __ align(CodeEntryAlignment);
 9016 
 9017     StubCodeMark mark(this, stub_id);
 9018 
 9019     address entry = __ pc();
 9020     __ enter();
 9021     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 9022     // also advance pointers to use post-increment instead of pre-increment
 9023     __ add(a1, a1, wordSize);
 9024     __ add(a2, a2, wordSize);
 9025     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so we can perform an additional
      // 8-byte load if needed, at least for the 1st address, to make it
      // 16-byte aligned.
 9031       Label ALIGNED16;
 9032       __ tbz(a1, 3, ALIGNED16);
 9033       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 9034       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 9035       __ sub(cnt1, cnt1, wordSize);
 9036       __ eor(tmp1, tmp1, tmp2);
 9037       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 9038       __ bind(ALIGNED16);
 9039     }
 9040     if (UseSIMDForArrayEquals) {
 9041       if (SoftwarePrefetchHintDistance >= 0) {
 9042         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 9043         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 9044         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 9045             /* prfm = */ true, NOT_EQUAL);
 9046         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 9047         __ br(__ LT, TAIL);
 9048       }
 9049       __ bind(NO_PREFETCH_LARGE_LOOP);
 9050       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 9051           /* prfm = */ false, NOT_EQUAL);
 9052     } else {
 9053       __ push(spilled_regs, sp);
 9054       if (SoftwarePrefetchHintDistance >= 0) {
 9055         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 9056         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 9057         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 9058             /* prfm = */ true, NOT_EQUAL);
 9059         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 9060         __ br(__ LT, TAIL);
 9061       }
 9062       __ bind(NO_PREFETCH_LARGE_LOOP);
 9063       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 9064           /* prfm = */ false, NOT_EQUAL);
 9065     }
 9066     __ bind(TAIL);
 9067       __ cbz(cnt1, EQUAL);
 9068       __ subs(cnt1, cnt1, wordSize);
 9069       __ br(__ LE, POST_LOOP);
 9070     __ bind(SMALL_LOOP);
 9071       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 9072       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 9073       __ subs(cnt1, cnt1, wordSize);
 9074       __ eor(tmp1, tmp1, tmp2);
 9075       __ cbnz(tmp1, NOT_EQUAL);
 9076       __ br(__ GT, SMALL_LOOP);
 9077     __ bind(POST_LOOP);
 9078       __ ldr(tmp1, Address(a1, cnt1));
 9079       __ ldr(tmp2, Address(a2, cnt1));
 9080       __ eor(tmp1, tmp1, tmp2);
 9081       __ cbnz(tmp1, NOT_EQUAL);
 9082     __ bind(EQUAL);
 9083       __ mov(result, true);
 9084     __ bind(NOT_EQUAL);
 9085       if (!UseSIMDForArrayEquals) {
 9086         __ pop(spilled_regs, sp);
 9087       }
 9088     __ bind(NOT_EQUAL_NO_POP);
 9089     __ leave();
 9090     __ ret(lr);
 9091 
 9092     // record the stub entry and end
 9093     store_archive_data(stub_id, entry, __ pc());
 9094 
 9095     return entry;
 9096   }
 9097 
 9098   // result = r0 - return value. Contains initial hashcode value on entry.
 9099   // ary = r1 - array address
 9100   // cnt = r2 - elements count
 9101   // Clobbers: v0-v13, rscratch1, rscratch2
 9102   address generate_large_arrays_hashcode(BasicType eltype) {
 9103     StubId stub_id;
 9104     switch (eltype) {
 9105     case T_BOOLEAN:
 9106       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 9107       break;
 9108     case T_BYTE:
 9109       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 9110       break;
 9111     case T_CHAR:
 9112       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 9113       break;
 9114     case T_SHORT:
 9115       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 9116       break;
 9117     case T_INT:
 9118       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 9119       break;
 9120     default:
 9121       stub_id = StubId::NO_STUBID;
 9122       ShouldNotReachHere();
 9123     };
 9124     int entry_count = StubInfo::entry_count(stub_id);
 9125     assert(entry_count == 1, "sanity check");
 9126     address start = load_archive_data(stub_id);
 9127     if (start != nullptr) {
 9128       return start;
 9129     }
 9130     const Register result = r0, ary = r1, cnt = r2;
 9131     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 9132     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 9133     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 9134     const FloatRegister vpowm = v13;
 9135 
 9136     ARRAYS_HASHCODE_REGISTERS;
 9137 
 9138     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 9139 
 9140     unsigned int vf; // vectorization factor
 9141     bool multiply_by_halves;
 9142     Assembler::SIMD_Arrangement load_arrangement;
 9143     switch (eltype) {
 9144     case T_BOOLEAN:
 9145     case T_BYTE:
 9146       load_arrangement = Assembler::T8B;
 9147       multiply_by_halves = true;
 9148       vf = 8;
 9149       break;
 9150     case T_CHAR:
 9151     case T_SHORT:
 9152       load_arrangement = Assembler::T8H;
 9153       multiply_by_halves = true;
 9154       vf = 8;
 9155       break;
 9156     case T_INT:
 9157       load_arrangement = Assembler::T4S;
 9158       multiply_by_halves = false;
 9159       vf = 4;
 9160       break;
 9161     default:
 9162       ShouldNotReachHere();
 9163     }
 9164 
 9165     // Unroll factor
 9166     const unsigned uf = 4;
 9167 
 9168     // Effective vectorization factor
 9169     const unsigned evf = vf * uf;
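    // Each accumulator keeps 4 partial sums in its T4S lanes. Every loop
    // iteration multiplies the accumulators by 31^vf (small loop) or 31^evf
    // (large loop) and adds the freshly loaded elements; the epilogues then
    // weight the 4 lanes with vpow = <31^3, 31^2, 31, 1> and sum them to
    // recombine the partial sums into the scalar hash.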
 9170 
 9171     __ align(CodeEntryAlignment);
 9172 
 9173     StubCodeMark mark(this, stub_id);
 9174 
 9175     address entry = __ pc();
 9176     __ enter();
 9177 
    // Put the 0th to 3rd powers of 31 into a single SIMD register. The register is
    // used in the SMALL and LARGE LOOPs' epilogues. The initialization is hoisted
    // here, as the register's value must not change throughout either loop.
 9181     __ movw(rscratch1, intpow(31U, 3));
 9182     __ mov(vpow, Assembler::S, 0, rscratch1);
 9183     __ movw(rscratch1, intpow(31U, 2));
 9184     __ mov(vpow, Assembler::S, 1, rscratch1);
 9185     __ movw(rscratch1, intpow(31U, 1));
 9186     __ mov(vpow, Assembler::S, 2, rscratch1);
 9187     __ movw(rscratch1, intpow(31U, 0));
 9188     __ mov(vpow, Assembler::S, 3, rscratch1);
 9189 
 9190     __ mov(vmul0, Assembler::T16B, 0);
 9191     __ mov(vmul0, Assembler::S, 3, result);
 9192 
 9193     __ andr(rscratch2, cnt, (uf - 1) * vf);
 9194     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 9195 
 9196     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 9197     __ mov(vpowm, Assembler::S, 0, rscratch1);
 9198 
 9199     // SMALL LOOP
 9200     __ bind(SMALL_LOOP);
 9201 
 9202     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 9203     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 9204     __ subsw(rscratch2, rscratch2, vf);
 9205 
 9206     if (load_arrangement == Assembler::T8B) {
 9207       // Extend 8B to 8H to be able to use vector multiply
 9208       // instructions
 9209       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 9210       if (is_signed_subword_type(eltype)) {
 9211         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9212       } else {
 9213         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9214       }
 9215     }
 9216 
 9217     switch (load_arrangement) {
 9218     case Assembler::T4S:
 9219       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 9220       break;
 9221     case Assembler::T8B:
 9222     case Assembler::T8H:
 9223       assert(is_subword_type(eltype), "subword type expected");
 9224       if (is_signed_subword_type(eltype)) {
 9225         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9226       } else {
 9227         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9228       }
 9229       break;
 9230     default:
 9231       __ should_not_reach_here();
 9232     }
 9233 
 9234     // Process the upper half of a vector
 9235     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 9236       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 9237       if (is_signed_subword_type(eltype)) {
 9238         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9239       } else {
 9240         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9241       }
 9242     }
 9243 
 9244     __ br(Assembler::HI, SMALL_LOOP);
 9245 
    // SMALL LOOP'S EPILOGUE
 9247     __ lsr(rscratch2, cnt, exact_log2(evf));
 9248     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 9249 
 9250     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 9251     __ addv(vmul0, Assembler::T4S, vmul0);
 9252     __ umov(result, vmul0, Assembler::S, 0);
 9253 
 9254     // TAIL
 9255     __ bind(TAIL);
 9256 
    // The andr computes cnt % vf. The subtract with the shifted operand jumps over
    // vf - 1 - (cnt % vf) of the load + madd instruction pairs below, i.e. it
    // executes exactly cnt % vf load + madd pairs.
 9259     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 9260     __ andr(rscratch2, cnt, vf - 1);
 9261     __ bind(TAIL_SHORTCUT);
 9262     __ adr(rscratch1, BR_BASE);
    // For Cortex-A53 the scale is 4 because 2 nops are generated per iteration.
 9264     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 9265     __ movw(rscratch2, 0x1f);
 9266     __ br(rscratch1);
 9267 
 9268     for (size_t i = 0; i < vf - 1; ++i) {
 9269       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 9270                                    eltype);
 9271       __ maddw(result, result, rscratch2, rscratch1);
 9272       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 9273       // Generate 2nd nop to have 4 instructions per iteration.
 9274       if (VM_Version::supports_a53mac()) {
 9275         __ nop();
 9276       }
 9277     }
 9278     __ bind(BR_BASE);
 9279 
 9280     __ leave();
 9281     __ ret(lr);
 9282 
 9283     // LARGE LOOP
 9284     __ bind(LARGE_LOOP_PREHEADER);
 9285 
 9286     __ lsr(rscratch2, cnt, exact_log2(evf));
 9287 
 9288     if (multiply_by_halves) {
 9289       // 31^4 - multiplier between lower and upper parts of a register
 9290       __ movw(rscratch1, intpow(31U, vf / 2));
 9291       __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 9293       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 9294       __ mov(vpowm, Assembler::S, 0, rscratch1);
 9295     } else {
 9296       // 31^16
 9297       __ movw(rscratch1, intpow(31U, evf));
 9298       __ mov(vpowm, Assembler::S, 0, rscratch1);
 9299     }
 9300 
 9301     __ mov(vmul3, Assembler::T16B, 0);
 9302     __ mov(vmul2, Assembler::T16B, 0);
 9303     __ mov(vmul1, Assembler::T16B, 0);
 9304 
 9305     __ bind(LARGE_LOOP);
 9306 
 9307     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 9308     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 9309     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 9310     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 9311 
 9312     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 9313            Address(__ post(ary, evf * type2aelembytes(eltype))));
 9314 
 9315     if (load_arrangement == Assembler::T8B) {
 9316       // Extend 8B to 8H to be able to use vector multiply
 9317       // instructions
 9318       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 9319       if (is_signed_subword_type(eltype)) {
 9320         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 9321         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 9322         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 9323         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9324       } else {
 9325         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 9326         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 9327         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 9328         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9329       }
 9330     }
 9331 
 9332     switch (load_arrangement) {
 9333     case Assembler::T4S:
 9334       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 9335       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 9336       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 9337       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 9338       break;
 9339     case Assembler::T8B:
 9340     case Assembler::T8H:
 9341       assert(is_subword_type(eltype), "subword type expected");
 9342       if (is_signed_subword_type(eltype)) {
 9343         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 9344         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 9345         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 9346         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9347       } else {
 9348         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 9349         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 9350         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 9351         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9352       }
 9353       break;
 9354     default:
 9355       __ should_not_reach_here();
 9356     }
 9357 
 9358     // Process the upper half of a vector
 9359     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 9360       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 9361       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 9362       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 9363       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 9364       if (is_signed_subword_type(eltype)) {
 9365         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 9366         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 9367         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 9368         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9369       } else {
 9370         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 9371         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 9372         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 9373         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9374       }
 9375     }
 9376 
 9377     __ subsw(rscratch2, rscratch2, 1);
 9378     __ br(Assembler::HI, LARGE_LOOP);
 9379 
 9380     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 9381     __ addv(vmul3, Assembler::T4S, vmul3);
 9382     __ umov(result, vmul3, Assembler::S, 0);
 9383 
 9384     __ mov(rscratch2, intpow(31U, vf));
 9385 
 9386     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 9387     __ addv(vmul2, Assembler::T4S, vmul2);
 9388     __ umov(rscratch1, vmul2, Assembler::S, 0);
 9389     __ maddw(result, result, rscratch2, rscratch1);
 9390 
 9391     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 9392     __ addv(vmul1, Assembler::T4S, vmul1);
 9393     __ umov(rscratch1, vmul1, Assembler::S, 0);
 9394     __ maddw(result, result, rscratch2, rscratch1);
 9395 
 9396     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 9397     __ addv(vmul0, Assembler::T4S, vmul0);
 9398     __ umov(rscratch1, vmul0, Assembler::S, 0);
 9399     __ maddw(result, result, rscratch2, rscratch1);
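          // The combine steps above are, in scalar form (illustrative only,
          // with S3..S0 the per-register partial sums and vf the
          // vectorization factor):
          //   result = ((S3 * 31^vf + S2) * 31^vf + S1) * 31^vf + S0;
          // i.e. the block-wise equivalent of h = h * 31 + a[i].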
 9400 
 9401     __ andr(rscratch2, cnt, vf - 1);
 9402     __ cbnz(rscratch2, TAIL_SHORTCUT);
 9403 
 9404     __ leave();
 9405     __ ret(lr);
 9406 
 9407     // record the stub entry and end
 9408     store_archive_data(stub_id, entry, __ pc());
 9409 
 9410     return entry;
 9411   }
 9412 
 9413   address generate_dsin_dcos(bool isCos) {
 9414     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 9415     int entry_count = StubInfo::entry_count(stub_id);
 9416     assert(entry_count == 1, "sanity check");
 9417     address start = load_archive_data(stub_id);
 9418     if (start != nullptr) {
 9419       return start;
 9420     }
 9421     __ align(CodeEntryAlignment);
 9422     StubCodeMark mark(this, stub_id);
 9423     start = __ pc();
 9424     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 9425         (address)StubRoutines::aarch64::_two_over_pi,
 9426         (address)StubRoutines::aarch64::_pio2,
 9427         (address)StubRoutines::aarch64::_dsin_coef,
 9428         (address)StubRoutines::aarch64::_dcos_coef);
 9429 
 9430     // record the stub entry and end
 9431     store_archive_data(stub_id, start, __ pc());
 9432 
 9433     return start;
 9434   }
 9435 
 9436   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
 9437   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 9438       Label &DIFF2) {
 9439     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 9440     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 9441 
 9442     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 9443     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 9444     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 9445     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
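          // Illustrative sketch of the zip trick (not compiled): zip1
          // interleaves the Latin1 bytes with zero bytes from vtmpZ, so bytes
          // {0x41, 0x42, ...} become half-words {0x0041, 0x0042, ...}, the
          // UTF-16 encoding of the same characters.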
 9446 
 9447     __ fmovd(tmpL, vtmp3);
 9448     __ eor(rscratch2, tmp3, tmpL);
 9449     __ cbnz(rscratch2, DIFF2);
 9450 
 9451     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9452     __ umov(tmpL, vtmp3, __ D, 1);
 9453     __ eor(rscratch2, tmpU, tmpL);
 9454     __ cbnz(rscratch2, DIFF1);
 9455 
 9456     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 9457     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 9458     __ fmovd(tmpL, vtmp);
 9459     __ eor(rscratch2, tmp3, tmpL);
 9460     __ cbnz(rscratch2, DIFF2);
 9461 
 9462     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9463     __ umov(tmpL, vtmp, __ D, 1);
 9464     __ eor(rscratch2, tmpU, tmpL);
 9465     __ cbnz(rscratch2, DIFF1);
 9466   }
 9467 
 9468   // r0  = result
 9469   // r1  = str1
 9470   // r2  = cnt1
 9471   // r3  = str2
 9472   // r4  = cnt2
 9473   // r10 = tmp1
 9474   // r11 = tmp2
 9475   address generate_compare_long_string_different_encoding(bool isLU) {
 9476     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 9477     int entry_count = StubInfo::entry_count(stub_id);
 9478     assert(entry_count == 1, "sanity check");
 9479     address start = load_archive_data(stub_id);
 9480     if (start != nullptr) {
 9481       return start;
 9482     }
 9483     __ align(CodeEntryAlignment);
 9484     StubCodeMark mark(this, stub_id);
 9485     address entry = __ pc();
 9486     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 9487         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 9488         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 9489     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9490         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 9491     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 9492     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 9493 
 9494     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 9495 
 9496     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 9497     // cnt2 == number of characters left to compare
 9498     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU)/tmp1 (UL))
 9499     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 9500     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 9501     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 9502     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 9503     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. The last 4 are a special case.
 9504     __ eor(rscratch2, tmp1, tmp2);
 9505     __ mov(rscratch1, tmp2);
 9506     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 9507     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 9508              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 9509     __ push(spilled_regs, sp);
 9510     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 9511     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 9512 
 9513     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9514 
 9515     if (SoftwarePrefetchHintDistance >= 0) {
 9516       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 9517       __ br(__ LT, NO_PREFETCH);
 9518       __ bind(LARGE_LOOP_PREFETCH);
 9519         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 9520         __ mov(tmp4, 2);
 9521         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 9522         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 9523           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9524           __ subs(tmp4, tmp4, 1);
 9525           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 9526           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 9527           __ mov(tmp4, 2);
 9528         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 9529           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9530           __ subs(tmp4, tmp4, 1);
 9531           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 9532           __ sub(cnt2, cnt2, 64);
 9533           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 9534           __ br(__ GE, LARGE_LOOP_PREFETCH);
 9535     }
 9536     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 9537     __ bind(NO_PREFETCH);
 9538     __ subs(cnt2, cnt2, 16);
 9539     __ br(__ LT, TAIL);
 9540     __ align(OptoLoopAlignment);
 9541     __ bind(SMALL_LOOP); // smaller loop
 9542       __ subs(cnt2, cnt2, 16);
 9543       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9544       __ br(__ GE, SMALL_LOOP);
 9545       __ cmn(cnt2, (u1)16);
 9546       __ br(__ EQ, LOAD_LAST);
 9547     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 9548       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 9549       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 9550       __ ldr(tmp3, Address(cnt1, -8));
 9551       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 9552       __ b(LOAD_LAST);
 9553     __ bind(DIFF2);
 9554       __ mov(tmpU, tmp3);
 9555     __ bind(DIFF1);
 9556       __ pop(spilled_regs, sp);
 9557       __ b(CALCULATE_DIFFERENCE);
 9558     __ bind(LOAD_LAST);
 9559       // The last 4 UTF-16 characters were already pre-loaded into tmp3 by
 9560       // compare_string_16_x_LU, so there is no need to load them again.
 9561       __ mov(tmpU, tmp3);
 9562       __ pop(spilled_regs, sp);
 9563 
 9564       // tmp2 points to the address of the last 4 Latin1 characters right now
 9565       __ ldrs(vtmp, Address(tmp2));
 9566       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 9567       __ fmovd(tmpL, vtmp);
 9568 
 9569       __ eor(rscratch2, tmpU, tmpL);
 9570       __ cbz(rscratch2, DONE);
 9571 
 9572     // Find the first different characters in the longwords and
 9573     // compute their difference.
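          // Sketch of the idea (illustrative only): rev makes the first
          // in-memory byte the most significant, so clz on the XORed words
          // yields the bit offset of the first difference; anding with -16
          // rounds that down to a 16-bit character boundary, and lsrv/uxthw
          // then extract the differing character from each operand.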
 9574     __ bind(CALCULATE_DIFFERENCE);
 9575       __ rev(rscratch2, rscratch2);
 9576       __ clz(rscratch2, rscratch2);
 9577       __ andr(rscratch2, rscratch2, -16);
 9578       __ lsrv(tmp1, tmp1, rscratch2);
 9579       __ uxthw(tmp1, tmp1);
 9580       __ lsrv(rscratch1, rscratch1, rscratch2);
 9581       __ uxthw(rscratch1, rscratch1);
 9582       __ subw(result, tmp1, rscratch1);
 9583     __ bind(DONE);
 9584       __ ret(lr);
 9585 
 9586       // record the stub entry and end
 9587       store_archive_data(stub_id, entry, __ pc());
 9588 
 9589       return entry;
 9590   }
 9591 
 9592   // r0 = input (float16)
 9593   // v0 = result (float)
 9594   // v1 = temporary float register
 9595   address generate_float16ToFloat() {
 9596     StubId stub_id = StubId::stubgen_hf2f_id;
 9597     int entry_count = StubInfo::entry_count(stub_id);
 9598     assert(entry_count == 1, "sanity check");
 9599     address start = load_archive_data(stub_id);
 9600     if (start != nullptr) {
 9601       return start;
 9602     }
 9603     __ align(CodeEntryAlignment);
 9604     StubCodeMark mark(this, stub_id);
 9605     address entry = __ pc();
 9606     BLOCK_COMMENT("Entry:");
 9607     __ flt16_to_flt(v0, r0, v1);
 9608     __ ret(lr);
 9609 
 9610     // record the stub entry and end
 9611     store_archive_data(stub_id, entry, __ pc());
 9612 
 9613     return entry;
 9614   }
 9615 
 9616   // v0 = input (float)
 9617   // r0 = result (float16)
 9618   // v1 = temporary float register
 9619   address generate_floatToFloat16() {
 9620     StubId stub_id = StubId::stubgen_f2hf_id;
 9621     int entry_count = StubInfo::entry_count(stub_id);
 9622     assert(entry_count == 1, "sanity check");
 9623     address start = load_archive_data(stub_id);
 9624     if (start != nullptr) {
 9625       return start;
 9626     }
 9627     __ align(CodeEntryAlignment);
 9628     StubCodeMark mark(this, stub_id);
 9629     address entry = __ pc();
 9630     BLOCK_COMMENT("Entry:");
 9631     __ flt_to_flt16(r0, v0, v1);
 9632     __ ret(lr);
 9633 
 9634     // record the stub entry and end
 9635     store_archive_data(stub_id, entry, __ pc());
 9636 
 9637     return entry;
 9638   }
 9639 
 9640   address generate_method_entry_barrier() {
 9641     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 9642     int entry_count = StubInfo::entry_count(stub_id);
 9643     assert(entry_count == 1, "sanity check");
 9644     address start = load_archive_data(stub_id);
 9645     if (start != nullptr) {
 9646       return start;
 9647     }
 9648     __ align(CodeEntryAlignment);
 9649     StubCodeMark mark(this, stub_id);
 9650 
 9651     Label deoptimize_label;
 9652 
 9653     start = __ pc();
 9654 
 9655     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 9656 
 9657     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 9658       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 9659       // We can get here despite the nmethod being good, if we have not
 9660       // yet applied our cross modification fence (or data fence).
 9661       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 9662       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 9663       __ ldrw(rscratch2, rscratch2);
 9664       __ strw(rscratch2, thread_epoch_addr);
 9665       __ isb();
 9666       __ membar(__ LoadLoad);
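            // (Sketch of the intent, per the comment above: the strw
            // publishes the epoch we observed to the thread, the isb acts as
            // the cross-modifying-code fence, and the LoadLoad barrier keeps
            // the epoch load ordered before later guard-value loads.)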
 9667     }
 9668 
 9669     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 9670 
 9671     __ enter();
 9672     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 9673 
 9674     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 9675 
 9676     __ push_call_clobbered_registers();
 9677 
 9678     __ mov(c_rarg0, rscratch2);
 9679     __ call_VM_leaf
 9680          (CAST_FROM_FN_PTR
 9681           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 9682 
 9683     __ reset_last_Java_frame(true);
 9684 
 9685     __ mov(rscratch1, r0);
 9686 
 9687     __ pop_call_clobbered_registers();
 9688 
 9689     __ cbnz(rscratch1, deoptimize_label);
 9690 
 9691     __ leave();
 9692     __ ret(lr);
 9693 
 9694     __ BIND(deoptimize_label);
 9695 
 9696     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9697     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9698 
 9699     __ mov(sp, rscratch1);
 9700     __ br(rscratch2);
 9701 
 9702     // record the stub entry and end
 9703     store_archive_data(stub_id, start, __ pc());
 9704 
 9705     return start;
 9706   }
 9707 
 9708   // r0  = result
 9709   // r1  = str1
 9710   // r2  = cnt1
 9711   // r3  = str2
 9712   // r4  = cnt2
 9713   // r10 = tmp1
 9714   // r11 = tmp2
 9715   address generate_compare_long_string_same_encoding(bool isLL) {
 9716     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9717     int entry_count = StubInfo::entry_count(stub_id);
 9718     assert(entry_count == 1, "sanity check");
 9719     address start = load_archive_data(stub_id);
 9720     if (start != nullptr) {
 9721       return start;
 9722     }
 9723     __ align(CodeEntryAlignment);
 9724     StubCodeMark mark(this, stub_id);
 9725     address entry = __ pc();
 9726     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9727         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9728 
 9729     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9730 
 9731     // exit from the large loop when fewer than 64 bytes are left to read or we
 9732     // are about to prefetch memory beyond the array bounds
 9733     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
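          // e.g. (illustrative): if SoftwarePrefetchHintDistance were 192, the
          // UU variant would leave the large loop once cnt2 < 192/2 = 96 chars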
 9734 
 9735     // 8 bytes were pre-loaded before jumping to the stub, so do the comparison directly
 9736     __ eor(rscratch2, tmp1, tmp2);
 9737     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9738 
 9739     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9740     // update pointers, because of previous read
 9741     __ add(str1, str1, wordSize);
 9742     __ add(str2, str2, wordSize);
 9743     if (SoftwarePrefetchHintDistance >= 0) {
 9744       __ align(OptoLoopAlignment);
 9745       __ bind(LARGE_LOOP_PREFETCH);
 9746         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9747         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9748 
 9749         for (int i = 0; i < 4; i++) {
 9750           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9751           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9752           __ cmp(tmp1, tmp2);
 9753           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9754           __ br(Assembler::NE, DIFF);
 9755         }
 9756         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9757         __ add(str1, str1, 64);
 9758         __ add(str2, str2, 64);
 9759         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9760         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9761         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9762     }
 9763 
 9764     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9765     __ br(Assembler::LE, LESS16);
 9766     __ align(OptoLoopAlignment);
 9767     __ bind(LOOP_COMPARE16);
 9768       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9769       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9770       __ cmp(tmp1, tmp2);
 9771       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9772       __ br(Assembler::NE, DIFF);
 9773       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9774       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9775       __ br(Assembler::LT, LESS16);
 9776 
 9777       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9778       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9779       __ cmp(tmp1, tmp2);
 9780       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9781       __ br(Assembler::NE, DIFF);
 9782       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9783       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9784       __ br(Assembler::GE, LOOP_COMPARE16);
 9785       __ cbz(cnt2, LENGTH_DIFF);
 9786 
 9787     __ bind(LESS16);
 9788       // compare 8 bytes at a time
 9789       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9790       __ br(Assembler::LE, LESS8);
 9791       __ ldr(tmp1, Address(__ post(str1, 8)));
 9792       __ ldr(tmp2, Address(__ post(str2, 8)));
 9793       __ eor(rscratch2, tmp1, tmp2);
 9794       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9795       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9796 
 9797     __ bind(LESS8); // directly load last 8 bytes
 9798       if (!isLL) {
 9799         __ add(cnt2, cnt2, cnt2);
 9800       }
 9801       __ ldr(tmp1, Address(str1, cnt2));
 9802       __ ldr(tmp2, Address(str2, cnt2));
 9803       __ eor(rscratch2, tmp1, tmp2);
 9804       __ cbz(rscratch2, LENGTH_DIFF);
 9805       __ b(CAL_DIFFERENCE);
 9806 
 9807     __ bind(DIFF);
 9808       __ cmp(tmp1, tmp2);
 9809       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9810       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9811       // reuse rscratch2 register for the result of eor instruction
 9812       __ eor(rscratch2, tmp1, tmp2);
 9813 
 9814     __ bind(CAL_DIFFERENCE);
 9815       __ rev(rscratch2, rscratch2);
 9816       __ clz(rscratch2, rscratch2);
 9817       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9818       __ lsrv(tmp1, tmp1, rscratch2);
 9819       __ lsrv(tmp2, tmp2, rscratch2);
 9820       if (isLL) {
 9821         __ uxtbw(tmp1, tmp1);
 9822         __ uxtbw(tmp2, tmp2);
 9823       } else {
 9824         __ uxthw(tmp1, tmp1);
 9825         __ uxthw(tmp2, tmp2);
 9826       }
 9827       __ subw(result, tmp1, tmp2);
 9828 
 9829     __ bind(LENGTH_DIFF);
 9830       __ ret(lr);
 9831 
 9832     // record the stub entry and end
 9833     store_archive_data(stub_id, entry, __ pc());
 9834 
 9835     return entry;
 9836   }
 9837 
 9838   enum string_compare_mode {
 9839     LL,
 9840     LU,
 9841     UL,
 9842     UU,
 9843   };
 9844 
 9845   // The following registers are declared in aarch64.ad
 9846   // r0  = result
 9847   // r1  = str1
 9848   // r2  = cnt1
 9849   // r3  = str2
 9850   // r4  = cnt2
 9851   // r10 = tmp1
 9852   // r11 = tmp2
 9853   // z0  = ztmp1
 9854   // z1  = ztmp2
 9855   // p0  = pgtmp1
 9856   // p1  = pgtmp2
 9857   address generate_compare_long_string_sve(string_compare_mode mode) {
 9858     StubId stub_id;
 9859     switch (mode) {
 9860       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9861       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9862       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9863       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9864       default: ShouldNotReachHere();
 9865     }
 9866     int entry_count = StubInfo::entry_count(stub_id);
 9867     assert(entry_count == 1, "sanity check");
 9868     address start = load_archive_data(stub_id);
 9869     if (start != nullptr) {
 9870       return start;
 9871     }
 9872     __ align(CodeEntryAlignment);
 9873     StubCodeMark mark(this, stub_id);
 9874     address entry = __ pc();
 9875     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9876              tmp1 = r10, tmp2 = r11;
 9877 
 9878     Label LOOP, DONE, MISMATCH;
 9879     Register vec_len = tmp1;
 9880     Register idx = tmp2;
 9881     // The minimum of the string lengths has been stored in cnt2.
 9882     Register cnt = cnt2;
 9883     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9884     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9885 
 9886 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9887     switch (mode) {                                                            \
 9888       case LL:                                                                 \
 9889         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9890         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9891         break;                                                                 \
 9892       case LU:                                                                 \
 9893         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9894         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9895         break;                                                                 \
 9896       case UL:                                                                 \
 9897         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9898         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9899         break;                                                                 \
 9900       case UU:                                                                 \
 9901         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9902         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9903         break;                                                                 \
 9904       default:                                                                 \
 9905         ShouldNotReachHere();                                                  \
 9906     }
 9907 
 9908     __ mov(idx, 0);
 9909     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9910 
 9911     if (mode == LL) {
 9912       __ sve_cntb(vec_len);
 9913     } else {
 9914       __ sve_cnth(vec_len);
 9915     }
 9916 
 9917     __ sub(rscratch1, cnt, vec_len);
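          // Illustratively, with a 256-bit SVE implementation in LL mode,
          // vec_len is 32: the main loop runs while idx < cnt - vec_len, and
          // the final, possibly partial, iteration is predicated by the
          // whilelt after the loop.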
 9918 
 9919     __ bind(LOOP);
 9920 
 9921       // main loop
 9922       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9923       __ add(idx, idx, vec_len);
 9924       // Compare strings.
 9925       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9926       __ br(__ NE, MISMATCH);
 9927       __ cmp(idx, rscratch1);
 9928       __ br(__ LT, LOOP);
 9929 
 9930     // post loop, last iteration
 9931     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9932 
 9933     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9934     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9935     __ br(__ EQ, DONE);
 9936 
 9937     __ bind(MISMATCH);
 9938 
 9939     // Crop the predicate to the lanes before the first mismatch.
 9940     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9941     // Extract the first different characters of each string.
 9942     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9943     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9944 
 9945     // Compute the difference of the first different characters.
 9946     __ sub(result, rscratch1, rscratch2);
 9947 
 9948     __ bind(DONE);
 9949     __ ret(lr);
 9950 #undef LOAD_PAIR
 9951 
 9952     // record the stub entry and end
 9953     store_archive_data(stub_id, entry, __ pc());
 9954 
 9955     return entry;
 9956   }
 9957 
 9958   void generate_compare_long_strings() {
 9959     if (UseSVE == 0) {
 9960       StubRoutines::aarch64::_compare_long_string_LL
 9961           = generate_compare_long_string_same_encoding(true);
 9962       StubRoutines::aarch64::_compare_long_string_UU
 9963           = generate_compare_long_string_same_encoding(false);
 9964       StubRoutines::aarch64::_compare_long_string_LU
 9965           = generate_compare_long_string_different_encoding(true);
 9966       StubRoutines::aarch64::_compare_long_string_UL
 9967           = generate_compare_long_string_different_encoding(false);
 9968     } else {
 9969       StubRoutines::aarch64::_compare_long_string_LL
 9970           = generate_compare_long_string_sve(LL);
 9971       StubRoutines::aarch64::_compare_long_string_UU
 9972           = generate_compare_long_string_sve(UU);
 9973       StubRoutines::aarch64::_compare_long_string_LU
 9974           = generate_compare_long_string_sve(LU);
 9975       StubRoutines::aarch64::_compare_long_string_UL
 9976           = generate_compare_long_string_sve(UL);
 9977     }
 9978   }
 9979 
 9980   // R0 = result
 9981   // R1 = str2
 9982   // R2 = cnt1
 9983   // R3 = str1
 9984   // R4 = cnt2
 9985   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9986   //
 9987   // This generic linear code uses a few additional ideas that make it faster:
 9988   // 1) we can safely keep at least the 1st register of the pattern (since
 9989   // length >= 8), skipping the initial load (helps on systems with a single
 9990   // load pipeline)
 9991   // 2) we can use a "fast" single-character search to find the first symbol
 9992   // with fewer branches (one branch per loaded register instead of one per
 9993   // symbol); this is where constants like 0x0101...01, 0x00010001...0001,
 9994   // 0x7f7f...7f and 0x7fff7fff...7fff come from
 9995   // 3) after loading and analyzing the 1st register of the source string, it
 9996   // can be used to search for every occurrence of the 1st character, saving a
 9997   // few loads compared with a simpler-but-slower implementation
 9998   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
 9999   // re-initializes and compresses register values; this makes the code larger
10000   // and less readable, but most extra operations issue during loads or branches, so the penalty is minimal
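        // A minimal sketch of the idea-2 zero-byte trick (illustrative C, not
        // part of the stub): to find a byte value c in a 64-bit word w, xor
        // with c replicated into every byte, then detect a zero byte:
        //   uint64_t x = w ^ (0x0101010101010101ULL * c);
        //   uint64_t hit = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
        // 'hit' has the top bit set in each byte where w contained c. The stub
        // computes the same value with its sub/orr/bics sequence, since
        // ~x & 0x8080...80 == ~(x | 0x7f7f...7f).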
10001   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
10002     StubId stub_id;
10003     if (str1_isL) {
10004       if (str2_isL) {
10005         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
10006       } else {
10007         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
10008       }
10009     } else {
10010       if (str2_isL) {
10011         ShouldNotReachHere();
10012       } else {
10013         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
10014       }
10015     }
10016     int entry_count = StubInfo::entry_count(stub_id);
10017     assert(entry_count == 1, "sanity check");
10018     address start = load_archive_data(stub_id);
10019     if (start != nullptr) {
10020       return start;
10021     }
10022     __ align(CodeEntryAlignment);
10023     StubCodeMark mark(this, stub_id);
10024     address entry = __ pc();
10025 
10026     int str1_chr_size = str1_isL ? 1 : 2;
10027     int str2_chr_size = str2_isL ? 1 : 2;
10028     int str1_chr_shift = str1_isL ? 0 : 1;
10029     int str2_chr_shift = str2_isL ? 0 : 1;
10030     bool isL = str1_isL && str2_isL;
10031     // parameters
10032     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
10033     // temporary registers
10034     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
10035     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
10036     // redefinitions
10037     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
10038 
10039     __ push(spilled_regs, sp);
10040     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
10041         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
10042         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
10043         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
10044         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
10045         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
10046     // Read a whole register from str1. This is safe because length >= 8 here
10047     __ ldr(ch1, Address(str1));
10048     // Read a whole register from str2. This is safe because length >= 8 here
10049     __ ldr(ch2, Address(str2));
10050     __ sub(cnt2, cnt2, cnt1);
10051     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
10052     if (str1_isL != str2_isL) {
10053       __ eor(v0, __ T16B, v0, v0);
10054     }
10055     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
10056     __ mul(first, first, tmp1);
10057     // check if we have fewer than one register's worth of characters to check
10058     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
10059     if (str1_isL != str2_isL) {
10060       __ fmovd(v1, ch1);
10061     }
10062     __ br(__ LE, L_SMALL);
10063     __ eor(ch2, first, ch2);
10064     if (str1_isL != str2_isL) {
10065       __ zip1(v1, __ T16B, v1, v0);
10066     }
10067     __ sub(tmp2, ch2, tmp1);
10068     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10069     __ bics(tmp2, tmp2, ch2);
10070     if (str1_isL != str2_isL) {
10071       __ fmovd(ch1, v1);
10072     }
10073     __ br(__ NE, L_HAS_ZERO);
10074     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10075     __ add(result, result, wordSize/str2_chr_size);
10076     __ add(str2, str2, wordSize);
10077     __ br(__ LT, L_POST_LOOP);
10078     __ BIND(L_LOOP);
10079       __ ldr(ch2, Address(str2));
10080       __ eor(ch2, first, ch2);
10081       __ sub(tmp2, ch2, tmp1);
10082       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10083       __ bics(tmp2, tmp2, ch2);
10084       __ br(__ NE, L_HAS_ZERO);
10085     __ BIND(L_LOOP_PROCEED);
10086       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10087       __ add(str2, str2, wordSize);
10088       __ add(result, result, wordSize/str2_chr_size);
10089       __ br(__ GE, L_LOOP);
10090     __ BIND(L_POST_LOOP);
10091       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
10092       __ br(__ LE, NOMATCH);
10093       __ ldr(ch2, Address(str2));
10094       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10095       __ eor(ch2, first, ch2);
10096       __ sub(tmp2, ch2, tmp1);
10097       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10098       __ mov(tmp4, -1); // all bits set
10099       __ b(L_SMALL_PROCEED);
10100     __ align(OptoLoopAlignment);
10101     __ BIND(L_SMALL);
10102       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10103       __ eor(ch2, first, ch2);
10104       if (str1_isL != str2_isL) {
10105         __ zip1(v1, __ T16B, v1, v0);
10106       }
10107       __ sub(tmp2, ch2, tmp1);
10108       __ mov(tmp4, -1); // all bits set
10109       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10110       if (str1_isL != str2_isL) {
10111         __ fmovd(ch1, v1); // move converted 4 symbols
10112       }
10113     __ BIND(L_SMALL_PROCEED);
10114       __ lsrv(tmp4, tmp4, cnt2); // mask with zeroes in the useless bits
10115       __ bic(tmp2, tmp2, ch2);
10116       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
10117       __ rbit(tmp2, tmp2);
10118       __ br(__ EQ, NOMATCH);
10119     __ BIND(L_SMALL_HAS_ZERO_LOOP);
10120       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
10121       __ cmp(cnt1, u1(wordSize/str2_chr_size));
10122       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
10123       if (str2_isL) { // LL
10124         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10125         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10126         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10127         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10128         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10129       } else {
10130         __ mov(ch2, 0xE); // 0xE = 0b1110 masks the byte index down to an even offset
10131         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10132         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10133         __ lslv(tmp2, tmp2, tmp4);
10134         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10135         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10136         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10137         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10138       }
10139       __ cmp(ch1, ch2);
10140       __ mov(tmp4, wordSize/str2_chr_size);
10141       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10142     __ BIND(L_SMALL_CMP_LOOP);
10143       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10144                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10145       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10146                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10147       __ add(tmp4, tmp4, 1);
10148       __ cmp(tmp4, cnt1);
10149       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
10150       __ cmp(first, ch2);
10151       __ br(__ EQ, L_SMALL_CMP_LOOP);
10152     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
10153       __ cbz(tmp2, NOMATCH); // no more matches. exit
10154       __ clz(tmp4, tmp2);
10155       __ add(result, result, 1); // advance index
10156       __ add(str2, str2, str2_chr_size); // advance pointer
10157       __ b(L_SMALL_HAS_ZERO_LOOP);
10158     __ align(OptoLoopAlignment);
10159     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
10160       __ cmp(first, ch2);
10161       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10162       __ b(DONE);
10163     __ align(OptoLoopAlignment);
10164     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
10165       if (str2_isL) { // LL
10166         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10167         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10168         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10169         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10170         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10171       } else {
10172         __ mov(ch2, 0xE); // 0xE = 0b1110 masks the byte index down to an even offset
10173         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10174         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10175         __ lslv(tmp2, tmp2, tmp4);
10176         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10177         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10178         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10179         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10180       }
10181       __ cmp(ch1, ch2);
10182       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10183       __ b(DONE);
10184     __ align(OptoLoopAlignment);
10185     __ BIND(L_HAS_ZERO);
10186       __ rbit(tmp2, tmp2);
10187       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
10188       // Now compress the counters (cnt2 and cnt1) into one register: cnt1 is
10189       // placed in the high 32 bits of cnt2. This is fine because both counters
10190       // are 32-bit, unchanged in this loop, and restored on exit; cnt1 can thus be re-used here.
10191       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
10192       __ sub(result, result, 1);
10193     __ BIND(L_HAS_ZERO_LOOP);
10194       __ mov(cnt1, wordSize/str2_chr_size);
10195       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10196       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
10197       if (str2_isL) {
10198         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10199         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10200         __ lslv(tmp2, tmp2, tmp4);
10201         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10202         __ add(tmp4, tmp4, 1);
10203         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10204         __ lsl(tmp2, tmp2, 1);
10205         __ mov(tmp4, wordSize/str2_chr_size);
10206       } else {
10207         __ mov(ch2, 0xE);
10208         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10209         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10210         __ lslv(tmp2, tmp2, tmp4);
10211         __ add(tmp4, tmp4, 1);
10212         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10213         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10214         __ lsl(tmp2, tmp2, 1);
10215         __ mov(tmp4, wordSize/str2_chr_size);
10216         __ sub(str2, str2, str2_chr_size);
10217       }
10218       __ cmp(ch1, ch2);
10219       __ mov(tmp4, wordSize/str2_chr_size);
10220       __ br(__ NE, L_CMP_LOOP_NOMATCH);
10221     __ BIND(L_CMP_LOOP);
10222       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10223                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10224       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10225                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10226       __ add(tmp4, tmp4, 1);
10227       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10228       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
10229       __ cmp(cnt1, ch2);
10230       __ br(__ EQ, L_CMP_LOOP);
10231     __ BIND(L_CMP_LOOP_NOMATCH);
10232       // we did not find a match here
10233       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
10234       __ clz(tmp4, tmp2);
10235       __ add(str2, str2, str2_chr_size); // advance pointer
10236       __ b(L_HAS_ZERO_LOOP);
10237     __ align(OptoLoopAlignment);
10238     __ BIND(L_CMP_LOOP_LAST_CMP);
10239       __ cmp(cnt1, ch2);
10240       __ br(__ NE, L_CMP_LOOP_NOMATCH);
10241       __ b(DONE);
10242     __ align(OptoLoopAlignment);
10243     __ BIND(L_CMP_LOOP_LAST_CMP2);
10244       if (str2_isL) {
10245         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10246         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10247         __ lslv(tmp2, tmp2, tmp4);
10248         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10249         __ add(tmp4, tmp4, 1);
10250         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10251         __ lsl(tmp2, tmp2, 1);
10252       } else {
10253         __ mov(ch2, 0xE);
10254         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10255         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10256         __ lslv(tmp2, tmp2, tmp4);
10257         __ add(tmp4, tmp4, 1);
10258         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10259         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10260         __ lsl(tmp2, tmp2, 1);
10261         __ sub(str2, str2, str2_chr_size);
10262       }
10263       __ cmp(ch1, ch2);
10264       __ br(__ NE, L_CMP_LOOP_NOMATCH);
10265       __ b(DONE);
10266     __ align(OptoLoopAlignment);
10267     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
10268       // 1) Restore the "result" index. The index was a multiple of
10269       // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
10270       // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
10271       // wordSize/str2_chr_size - 1 and the respective high bits were not
10272       // changed. L_LOOP_PROCEED will increase result by the number of analyzed
10273       // characters, so we can just reset the lower bits of result here: clear
10274       // the 2 lower bits for UU/UL and the 3 lower bits for LL.
10275       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
10276       // 3) Advance str2 to the next octet: result & 7 (LL) or & 3 (UU/UL) is
10277       // the last analyzed index inside the current octet, so move str2 past it.
10278       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
10279       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
10280       __ bfm(result, zr, 0, 2 - str2_chr_shift);
10281       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
10282       __ movw(cnt2, cnt2);
10283       __ b(L_LOOP_PROCEED);
10284     __ align(OptoLoopAlignment);
10285     __ BIND(NOMATCH);
10286       __ mov(result, -1);
10287     __ BIND(DONE);
10288       __ pop(spilled_regs, sp);
10289       __ ret(lr);
10290 
10291     // record the stub entry and end
10292     store_archive_data(stub_id, entry, __ pc());
10293 
10294     return entry;
10295   }
10296 
10297   void generate_string_indexof_stubs() {
10298     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
10299     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
10300     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
10301   }
10302 
10303   void inflate_and_store_2_fp_registers(bool generatePrfm,
10304       FloatRegister src1, FloatRegister src2) {
10305     Register dst = r1;
10306     __ zip1(v1, __ T16B, src1, v0);
10307     __ zip2(v2, __ T16B, src1, v0);
10308     if (generatePrfm) {
10309       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
10310     }
10311     __ zip1(v3, __ T16B, src2, v0);
10312     __ zip2(v4, __ T16B, src2, v0);
10313     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
10314   }
10315 
10316   // R0 = src
10317   // R1 = dst
10318   // R2 = len
10319   // R3 = len >> 3
10320   // V0 = 0
10321   // v1 = loaded 8 bytes
10322   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
10323   address generate_large_byte_array_inflate() {
10324     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
10325     int entry_count = StubInfo::entry_count(stub_id);
10326     assert(entry_count == 1, "sanity check");
10327     address start = load_archive_data(stub_id);
10328     if (start != nullptr) {
10329       return start;
10330     }
10331     __ align(CodeEntryAlignment);
10332     StubCodeMark mark(this, stub_id);
10333     address entry = __ pc();
10334     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
10335     Register src = r0, dst = r1, len = r2, octetCounter = r3;
10336     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
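          // e.g. (illustrative): if SoftwarePrefetchHintDistance were 192,
          // large_loop_threshold = 192/8 + 4 = 28 octets, so the prefetching
          // loop runs while roughly 224 or more bytes remain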
10337 
10338     // Do one more 8-byte read so the address is 16-byte aligned in most cases;
10339     // this also lets us use a single store instruction
10340     __ ldrd(v2, __ post(src, 8));
10341     __ sub(octetCounter, octetCounter, 2);
10342     __ zip1(v1, __ T16B, v1, v0);
10343     __ zip1(v2, __ T16B, v2, v0);
10344     __ st1(v1, v2, __ T16B, __ post(dst, 32));
10345     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10346     __ subs(rscratch1, octetCounter, large_loop_threshold);
10347     __ br(__ LE, LOOP_START);
10348     __ b(LOOP_PRFM_START);
10349     __ bind(LOOP_PRFM);
10350       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10351     __ bind(LOOP_PRFM_START);
10352       __ prfm(Address(src, SoftwarePrefetchHintDistance));
10353       __ sub(octetCounter, octetCounter, 8);
10354       __ subs(rscratch1, octetCounter, large_loop_threshold);
10355       inflate_and_store_2_fp_registers(true, v3, v4);
10356       inflate_and_store_2_fp_registers(true, v5, v6);
10357       __ br(__ GT, LOOP_PRFM);
10358       __ cmp(octetCounter, (u1)8);
10359       __ br(__ LT, DONE);
10360     __ bind(LOOP);
10361       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10362       __ bind(LOOP_START);
10363       __ sub(octetCounter, octetCounter, 8);
10364       __ cmp(octetCounter, (u1)8);
10365       inflate_and_store_2_fp_registers(false, v3, v4);
10366       inflate_and_store_2_fp_registers(false, v5, v6);
10367       __ br(__ GE, LOOP);
10368     __ bind(DONE);
10369       __ ret(lr);
10370 
10371     // record the stub entry and end
10372     store_archive_data(stub_id, entry, __ pc());
10373 
10374     return entry;
10375   }
10376 
10377   /**
10378    *  Arguments:
10379    *
10380    *  Input:
10381    *  c_rarg0   - current state address
10382    *  c_rarg1   - H key address
10383    *  c_rarg2   - data address
10384    *  c_rarg3   - number of blocks
10385    *
10386    *  Output:
10387    *  Updated state at c_rarg0
10388    */
10389   address generate_ghash_processBlocks_small() {
10390     // Bafflingly, GCM uses little-endian for the byte order, but
10391     // big-endian for the bit order.  For example, the polynomial 1 is
10392     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
10393     //
10394     // So, we must either reverse the bytes in each word and do
10395     // everything big-endian or reverse the bits in each byte and do
10396     // it little-endian.  On AArch64 it's more idiomatic to reverse
10397     // the bits in each byte (we have an instruction, RBIT, to do
10398     // that) and keep the data in little-endian bit order through the
10399     // calculation, bit-reversing the inputs and outputs.
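          // Illustrative example of the convention above: the polynomial 1 is
          // the byte string 80 00 ... 00 because, within each byte, bit 0 of
          // the polynomial is the most significant bit; RBIT on every byte
          // turns it into 01 00 ... 00, a plain little-endian 1. The reduction
          // polynomial itself is z^128 + z^7 + z^2 + z + 1, of which only the
          // low-order bits (0x87) need to be materialized below.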
10400 
10401     StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
10402     int entry_count = StubInfo::entry_count(stub_id);
10403     assert(entry_count == 1, "sanity check");
10404     address start = load_archive_data(stub_id);
10405     if (start != nullptr) {
10406       return start;
10407     }
10408     __ align(CodeEntryAlignment);
10409     StubCodeMark mark(this, stub_id);
10410     Label polynomial; // local data generated at end of stub
10411     start = __ pc();
10412 
10413     Register state   = c_rarg0;
10414     Register subkeyH = c_rarg1;
10415     Register data    = c_rarg2;
10416     Register blocks  = c_rarg3;
10417 
10418     FloatRegister vzr = v30;
10419     __ eor(vzr, __ T16B, vzr, vzr); // zero register
10420 
10421     __ adr(rscratch1, polynomial);
10422     __ ldrq(v24, rscratch1);    // The field polynomial
10423 
10424     __ ldrq(v0, Address(state));
10425     __ ldrq(v1, Address(subkeyH));
10426 
10427     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
10428     __ rbit(v0, __ T16B, v0);
10429     __ rev64(v1, __ T16B, v1);
10430     __ rbit(v1, __ T16B, v1);
10431 
10432     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
10433     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
10434 
10435     {
10436       Label L_ghash_loop;
10437       __ bind(L_ghash_loop);
10438 
10439       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
10440                                                  // reversing each byte
10441       __ rbit(v2, __ T16B, v2);
10442       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
10443 
10444       // Multiply state in v2 by subkey in v1
10445       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
10446                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
10447                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
10448       // Reduce v7:v5 by the field polynomial
10449       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
10450 
10451       __ sub(blocks, blocks, 1);
10452       __ cbnz(blocks, L_ghash_loop);
10453     }
10454 
10455     // The bit-reversed result is at this point in v0
10456     __ rev64(v0, __ T16B, v0);
10457     __ rbit(v0, __ T16B, v0);
10458 
10459     __ st1(v0, __ T16B, state);
10460     __ ret(lr);
10461 
10462     // bind label and generate local polynomial data
10463     __ align(wordSize * 2);
10464     __ bind(polynomial);
10465     __ emit_int64(0x87);  // The low-order bits of the field
10466                           // polynomial (i.e. p = z^7+z^2+z+1)
10467                           // repeated in the low and high parts of a
10468                           // 128-bit vector
10469     __ emit_int64(0x87);
10470 
10471     // record the stub entry and end
10472     store_archive_data(stub_id, start, __ pc());
10473 
10474     return start;
10475   }
10476 
10477   address generate_ghash_processBlocks(address small) {
10478     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
10479     int entry_count = StubInfo::entry_count(stub_id);
10480     assert(entry_count == 1, "sanity check");
10481     address start = load_archive_data(stub_id);
10482     if (start != nullptr) {
10483       return start;
10484     }
10485     Label polynomial;           // local data generated after stub
10486     __ align(CodeEntryAlignment);
10487     StubCodeMark mark(this, stub_id);
10488     start = __ pc();
10489 
10490     Register state   = c_rarg0;
10491     Register subkeyH = c_rarg1;
10492     Register data    = c_rarg2;
10493     Register blocks  = c_rarg3;
10494 
10495     const int unroll = 4;
10496 
10497     __ cmp(blocks, (unsigned char)(unroll * 2));
10498     __ br(__ LT, small);
10499 
10500     if (unroll > 1) {
10501       // Save state before entering routine
10502       __ sub(sp, sp, 4 * 16);
10503       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
10504       __ sub(sp, sp, 4 * 16);
10505       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
10506     }
10507 
10508     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
10509 
10510     if (unroll > 1) {
10511       // And restore state
10512       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
10513       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
10514     }
10515 
10516     __ cmp(blocks, (unsigned char)0);
10517     __ br(__ GT, small);
10518 
10519     __ ret(lr);
10520 
10521     // bind label and generate polynomial data
10522     __ align(wordSize * 2);
10523     __ bind(polynomial);
10524     __ emit_int64(0x87);  // The low-order bits of the field
10525                           // polynomial (i.e. p = z^7+z^2+z+1)
10526                           // repeated in the low and high parts of a
10527                           // 128-bit vector
10528     __ emit_int64(0x87);
10529 
10530     // record the stub entry and end
10531     store_archive_data(stub_id, start, __ pc());
10532 
10533     return start;
10534   }
10535 
10536   void generate_base64_encode_simdround(Register src, Register dst,
10537         FloatRegister codec, u8 size) {
10538 
10539     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
10540     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
10541     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
10542 
10543     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10544 
10545     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
10546 
10547     __ ushr(ind0, arrangement, in0,  2);
10548 
10549     __ ushr(ind1, arrangement, in1,  2);
10550     __ shl(in0,   arrangement, in0,  6);
10551     __ orr(ind1,  arrangement, ind1, in0);
10552     __ ushr(ind1, arrangement, ind1, 2);
10553 
10554     __ ushr(ind2, arrangement, in2,  4);
10555     __ shl(in1,   arrangement, in1,  4);
10556     __ orr(ind2,  arrangement, in1,  ind2);
10557     __ ushr(ind2, arrangement, ind2, 2);
10558 
10559     __ shl(ind3,  arrangement, in2,  2);
10560     __ ushr(ind3, arrangement, ind3, 2);
10561 
10562     __ tbl(out0,  arrangement, codec,  4, ind0);
10563     __ tbl(out1,  arrangement, codec,  4, ind1);
10564     __ tbl(out2,  arrangement, codec,  4, ind2);
10565     __ tbl(out3,  arrangement, codec,  4, ind3);
10566 
10567     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
10568   }
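        // Worked example (illustrative only) of the 3-byte -> 4-index split
        // done by the SIMD round above and the scalar Process3B loop below:
        // the input bytes "Man" = 4D 61 6E form the 24-bit group
        // 010011 010110 000101 101110, i.e. the 6-bit indices 19, 22, 5 and
        // 46, which the standard codec table maps to "TWFu".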
10569 
10570   /**
10571    *  Arguments:
10572    *
10573    *  Input:
10574    *  c_rarg0   - src_start
10575    *  c_rarg1   - src_offset
10576    *  c_rarg2   - src_length
10577    *  c_rarg3   - dest_start
10578    *  c_rarg4   - dest_offset
10579    *  c_rarg5   - isURL
10580    *
10581    */
10582   address generate_base64_encodeBlock() {
10583 
10584     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
10585     int entry_count = StubInfo::entry_count(stub_id);
10586     assert(entry_count == 1, "sanity check");
10587     address start = load_archive_data(stub_id);
10588     if (start != nullptr) {
10589       return start;
10590     }
10591     __ align(CodeEntryAlignment);
10592     StubCodeMark mark(this, stub_id);
10593     start = __ pc();
10594 
10595     Register src   = c_rarg0;  // source array
10596     Register soff  = c_rarg1;  // source start offset
10597     Register send  = c_rarg2;  // source end offset
10598     Register dst   = c_rarg3;  // dest array
10599     Register doff  = c_rarg4;  // position for writing to dest array
10600     Register isURL = c_rarg5;  // Base64 or URL character set
10601 
10602     // c_rarg6 and c_rarg7 are free to use as temps
10603     Register codec  = c_rarg6;
10604     Register length = c_rarg7;
10605 
10606     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
10607 
10608     __ add(src, src, soff);
10609     __ add(dst, dst, doff);
10610     __ sub(length, send, soff);
10611 
10612     // load the codec base address
10613     __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
10614     __ cbz(isURL, ProcessData);
10615     __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));
10616 
10617     __ BIND(ProcessData);
10618 
10619     // too short to form a SIMD loop; fall back to byte-by-byte processing
10620     __ cmp(length, (u1)24);
10621     __ br(Assembler::LT, Process3B);
10622 
10623     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
10624 
10625     __ BIND(Process48B);
10626     __ cmp(length, (u1)48);
10627     __ br(Assembler::LT, Process24B);
10628     generate_base64_encode_simdround(src, dst, v0, 16);
10629     __ sub(length, length, 48);
10630     __ b(Process48B);
10631 
10632     __ BIND(Process24B);
10633     __ cmp(length, (u1)24);
10634     __ br(Assembler::LT, SIMDExit);
10635     generate_base64_encode_simdround(src, dst, v0, 8);
10636     __ sub(length, length, 24);
10637 
10638     __ BIND(SIMDExit);
10639     __ cbz(length, Exit);
10640 
10641     __ BIND(Process3B);
10642     //  3 src bytes, 24 bits
10643     __ ldrb(r10, __ post(src, 1));
10644     __ ldrb(r11, __ post(src, 1));
10645     __ ldrb(r12, __ post(src, 1));
10646     __ orrw(r11, r11, r10, Assembler::LSL, 8);
10647     __ orrw(r12, r12, r11, Assembler::LSL, 8);
10648     // codec index
10649     __ ubfmw(r15, r12, 18, 23);
10650     __ ubfmw(r14, r12, 12, 17);
10651     __ ubfmw(r13, r12, 6,  11);
10652     __ andw(r12,  r12, 63);
10653     // get the code based on the codec
10654     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
10655     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
10656     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
10657     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
10658     __ strb(r15, __ post(dst, 1));
10659     __ strb(r14, __ post(dst, 1));
10660     __ strb(r13, __ post(dst, 1));
10661     __ strb(r12, __ post(dst, 1));
10662     __ sub(length, length, 3);
10663     __ cbnz(length, Process3B);
10664 
10665     __ BIND(Exit);
10666     __ ret(lr);
10667 
10668     // record the stub entry and end
10669     store_archive_data(stub_id, start, __ pc());
10670 
10671     return start;
10672   }
10673 
10674   void generate_base64_decode_simdround(Register src, Register dst,
10675         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
10676 
10677     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
10678     FloatRegister out0 = v20, out1 = v21, out2 = v22;
10679 
10680     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
10681     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
10682 
10683     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
10684 
10685     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10686 
10687     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
10688 
10689     // we need an unsigned saturating subtract to make sure all input values
10690     // in the range [0, 63] map to index 0 in the higher-half lookup
10691     __ uqsubv(decH0, __ T16B, in0, v27);
10692     __ uqsubv(decH1, __ T16B, in1, v27);
10693     __ uqsubv(decH2, __ T16B, in2, v27);
10694     __ uqsubv(decH3, __ T16B, in3, v27);
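          // Illustrative consequence: an input byte of value at most 63 (e.g.
          // '+' = 0x2B) saturates to 0 here, so the tbx below fetches entry 0
          // of the higher-half table (which yields 0, per the comment above)
          // and the orr keeps the lower-half result; only larger input bytes
          // select a meaningful higher-half entry.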
10695 
10696     // lower half lookup
10697     __ tbl(decL0, arrangement, codecL, 4, in0);
10698     __ tbl(decL1, arrangement, codecL, 4, in1);
10699     __ tbl(decL2, arrangement, codecL, 4, in2);
10700     __ tbl(decL3, arrangement, codecL, 4, in3);
10701 
10702     // higher half lookup
10703     __ tbx(decH0, arrangement, codecH, 4, decH0);
10704     __ tbx(decH1, arrangement, codecH, 4, decH1);
10705     __ tbx(decH2, arrangement, codecH, 4, decH2);
10706     __ tbx(decH3, arrangement, codecH, 4, decH3);
10707 
10708     // combine lower and higher
10709     __ orr(decL0, arrangement, decL0, decH0);
10710     __ orr(decL1, arrangement, decL1, decH1);
10711     __ orr(decL2, arrangement, decL2, decH2);
10712     __ orr(decL3, arrangement, decL3, decH3);
10713 
10714     // check for illegal inputs: any decoded value larger than 63 (the 6-bit maximum)
10715     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10716     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10717     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10718     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10719     __ orr(in0, arrangement, decH0, decH1);
10720     __ orr(in1, arrangement, decH2, decH3);
10721     __ orr(in2, arrangement, in0,   in1);
10722     __ umaxv(in3, arrangement, in2);
10723     __ umov(rscratch2, in3, __ B, 0);
10724 
10725     // get the data to output
10726     __ shl(out0,  arrangement, decL0, 2);
10727     __ ushr(out1, arrangement, decL1, 4);
10728     __ orr(out0,  arrangement, out0,  out1);
10729     __ shl(out1,  arrangement, decL1, 4);
10730     __ ushr(out2, arrangement, decL2, 2);
10731     __ orr(out1,  arrangement, out1,  out2);
10732     __ shl(out2,  arrangement, decL2, 6);
10733     __ orr(out2,  arrangement, out2,  decL3);
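    // In C terms, with the four 6-bit values a, b, c, d per lane (a sketch):
    //   out0 = a << 2 | b >> 4;  out1 = b << 4 | c >> 2;  out2 = c << 6 | d;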
10734 
10735     __ cbz(rscratch2, NoIllegalData);
10736 
10737     // handle illegal input
10738     __ umov(r10, in2, __ D, 0);
10739     if (size == 16) {
10740       __ cbnz(r10, ErrorInLowerHalf);
10741 
10742       // illegal input is in higher half, store the lower half now.
10743       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10744 
10745       __ umov(r10, in2,  __ D, 1);
10746       __ umov(r11, out0, __ D, 1);
10747       __ umov(r12, out1, __ D, 1);
10748       __ umov(r13, out2, __ D, 1);
10749       __ b(StoreLegalData);
10750 
10751       __ BIND(ErrorInLowerHalf);
10752     }
10753     __ umov(r11, out0, __ D, 0);
10754     __ umov(r12, out1, __ D, 0);
10755     __ umov(r13, out2, __ D, 0);
10756 
10757     __ BIND(StoreLegalData);
10758     __ tbnz(r10, 5, Exit); // mask byte is 0x00 or 0xff, so any set bit flags illegal input
10759     __ strb(r11, __ post(dst, 1));
10760     __ strb(r12, __ post(dst, 1));
10761     __ strb(r13, __ post(dst, 1));
10762     __ lsr(r10, r10, 8);
10763     __ lsr(r11, r11, 8);
10764     __ lsr(r12, r12, 8);
10765     __ lsr(r13, r13, 8);
10766     __ b(StoreLegalData);
10767 
10768     __ BIND(NoIllegalData);
10769     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10770   }
10771 
10772 
10773   /**
10774    *  Arguments:
10775    *
10776    *  Input:
10777    *  c_rarg0   - src_start
10778    *  c_rarg1   - src_offset
10779    *  c_rarg2   - src_end (exclusive end offset, not a length)
10780    *  c_rarg3   - dest_start
10781    *  c_rarg4   - dest_offset
10782    *  c_rarg5   - isURL
10783    *  c_rarg6   - isMIME
10784    *
10785    */
10786   address generate_base64_decodeBlock() {
10787 
10788     // The SIMD part of this Base64 decode intrinsic is based on the algorithm
10789     // outlined at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
10790     // in the section titled "Base64 decoding".
10791 
10792     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10793     int entry_count = StubInfo::entry_count(stub_id);
10794     assert(entry_count == 1, "sanity check");
10795     address start = load_archive_data(stub_id);
10796     if (start != nullptr) {
10797       return start;
10798     }
10799     __ align(CodeEntryAlignment);
10800     StubCodeMark mark(this, stub_id);
10801     start = __ pc();
10802 
10803     Register src    = c_rarg0;  // source array
10804     Register soff   = c_rarg1;  // source start offset
10805     Register send   = c_rarg2;  // source end offset
10806     Register dst    = c_rarg3;  // dest array
10807     Register doff   = c_rarg4;  // position for writing to dest array
10808     Register isURL  = c_rarg5;  // Base64 or URL character set
10809     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10810 
10811     Register length = send;    // reuse send as length of source data to process
10812 
10813     Register simd_codec   = c_rarg6;
10814     Register nosimd_codec = c_rarg7;
10815 
10816     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10817 
10818     __ enter();
10819 
10820     __ add(src, src, soff);
10821     __ add(dst, dst, doff);
10822 
10823     __ mov(doff, dst); // remember dst start so Exit can return the number of bytes written
10824 
10825     __ sub(length, send, soff);
10826     __ bfm(length, zr, 0, 1);  // clear the low two bits: round length down to a multiple of 4
10827 
10828     __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
10829     __ cbz(isURL, ProcessData);
10830     __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));
10831 
10832     __ BIND(ProcessData);
10833     __ mov(rscratch1, length);
10834     __ cmp(length, (u1)144); // 144 = 80 + 64
10835     __ br(Assembler::LT, Process4B);
10836 
10837     // In the MIME case, the line length cannot be more than 76
10838     // bytes (see RFC 2045). This is too short a block for SIMD
10839     // to be worthwhile, so we use non-SIMD here.
10840     __ movw(rscratch1, 79);
10841 
10842     __ BIND(Process4B);
10843     __ ldrw(r14, __ post(src, 4));
10844     __ ubfxw(r10, r14, 0,  8);
10845     __ ubfxw(r11, r14, 8,  8);
10846     __ ubfxw(r12, r14, 16, 8);
10847     __ ubfxw(r13, r14, 24, 8);
10848     // look up the decoded values
10849     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10850     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10851     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10852     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10853     // error detection, 255u indicates an illegal input
10854     __ orrw(r14, r10, r11);
10855     __ orrw(r15, r12, r13);
10856     __ orrw(r14, r14, r15);
10857     __ tbnz(r14, 7, Exit);
10858     // recover the data
10859     __ lslw(r14, r10, 10);
10860     __ bfiw(r14, r11, 4, 6);
10861     __ bfmw(r14, r12, 2, 5);
10862     __ rev16w(r14, r14);
10863     __ bfiw(r13, r12, 6, 2);
10864     __ strh(r14, __ post(dst, 2));
10865     __ strb(r13, __ post(dst, 1));
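    // In C terms, with decoded 6-bit values a, b, c, d (a sketch):
    //   byte0 = a << 2 | b >> 4;  byte1 = (b & 15) << 4 | c >> 2;
    //   byte2 = (c & 3) << 6 | d;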
10866     // non-simd loop
10867     __ subsw(rscratch1, rscratch1, 4);
10868     __ br(Assembler::GT, Process4B);
10869 
10870     // if exiting from the 80-byte pre-processing above, rscratch1 == -1;
10871     // otherwise, rscratch1 == 0.
10872     __ cbzw(rscratch1, Exit);
10873     __ sub(length, length, 80);
10874 
10875     __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
10876     __ cbz(isURL, SIMDEnter);
10877     __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));
10878 
10879     __ BIND(SIMDEnter);
10880     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10881     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10882     __ mov(rscratch1, 63);
10883     __ dup(v27, __ T16B, rscratch1);
10884 
10885     __ BIND(Process64B);
10886     __ cmp(length, (u1)64);
10887     __ br(Assembler::LT, Process32B);
10888     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10889     __ sub(length, length, 64);
10890     __ b(Process64B);
10891 
10892     __ BIND(Process32B);
10893     __ cmp(length, (u1)32);
10894     __ br(Assembler::LT, SIMDExit);
10895     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10896     __ sub(length, length, 32);
10897     __ b(Process32B);
10898 
10899     __ BIND(SIMDExit);
10900     __ cbz(length, Exit);
10901     __ movw(rscratch1, length);
10902     __ b(Process4B);
10903 
10904     __ BIND(Exit);
10905     __ sub(c_rarg0, dst, doff);
10906 
10907     __ leave();
10908     __ ret(lr);
10909 
10910     // record the stub entry and end
10911     store_archive_data(stub_id, start, __ pc());
10912 
10913     return start;
10914   }
10915 
10916   // Support for spin waits.
10917   address generate_spin_wait() {
10918     StubId stub_id = StubId::stubgen_spin_wait_id;
10919     int entry_count = StubInfo::entry_count(stub_id);
10920     assert(entry_count == 1, "sanity check");
10921     address start = load_archive_data(stub_id);
10922     if (start != nullptr) {
10923       return start;
10924     }
10925     __ align(CodeEntryAlignment);
10926     StubCodeMark mark(this, stub_id);
10927     start = __ pc();
10928 
10929     __ spin_wait();
10930     __ ret(lr);
10931 
10932     // record the stub entry and end
10933     store_archive_data(stub_id, start, __ pc());
10934 
10935     return start;
10936   }
10937 
10938   void generate_lookup_secondary_supers_table_stub() {
10939     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10940     GrowableArray<address> entries;
10941     int entry_count = StubInfo::entry_count(stub_id);
10942     assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
10943     address start = load_archive_data(stub_id, &entries);
10944     if (start != nullptr) {
10945       assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
10946              "unexpected extra entry count %d", entries.length());
10947       StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
10948       for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10949         StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
10950       }
10951       return;
10952     }
10953 
10954     StubCodeMark mark(this, stub_id);
10955 
10956     const Register
10957       r_super_klass  = r0,
10958       r_array_base   = r1,
10959       r_array_length = r2,
10960       r_array_index  = r3,
10961       r_sub_klass    = r4,
10962       r_bitmap       = rscratch2,
10963       result         = r5;
10964     const FloatRegister
10965       vtemp          = v0;
10966 
10967     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10968       address next_entry = __ pc();
10969       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
10970       if (slot == 0) {
10971         start = next_entry;
10972       } else {
10973         entries.append(next_entry);
10974       }
10975       Label L_success;
10976       __ enter();
10977       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10978                                              r_array_base, r_array_length, r_array_index,
10979                                              vtemp, result, slot,
10980                                              /*stub_is_near*/true);
10981       __ leave();
10982       __ ret(lr);
10983     }
10984     // record the stub entry and end plus all the auxiliary entries
10985     store_archive_data(stub_id, start, __ pc(), &entries);
10986   }
10987 
10988   // Slow path implementation for UseSecondarySupersTable.
10989   address generate_lookup_secondary_supers_table_slow_path_stub() {
10990     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10991     int entry_count = StubInfo::entry_count(stub_id);
10992     assert(entry_count == 1, "sanity check");
10993     address start = load_archive_data(stub_id);
10994     if (start != nullptr) {
10995       return start;
10996     }
10997     StubCodeMark mark(this, stub_id);
10998     start = __ pc();
10999     const Register
11000       r_super_klass  = r0,        // argument
11001       r_array_base   = r1,        // argument
11002       temp1          = r2,        // temp
11003       r_array_index  = r3,        // argument
11004       r_bitmap       = rscratch2, // argument
11005       result         = r5;        // argument
11006 
11007     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
11008     __ ret(lr);
11009 
11010     // record the stub entry and end
11011     store_archive_data(stub_id, start, __ pc());
11012 
11013     return start;
11014   }
11015 
11016 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11017 
11018   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
11019   //
11020   // If LSE is in use, generate LSE versions of all the stubs. The
11021   // non-LSE versions are in atomic_aarch64.S.
11022 
11023   // class AtomicStubMark records the entry point of a stub and the
11024   // stub pointer which will point to it. The stub pointer is set to
11025   // the entry point when ~AtomicStubMark() is called, which must be
11026   // after ICache::invalidate_range. This ensures safe publication of
11027   // the generated code.
11028   class AtomicStubMark {
11029     address _entry_point;
11030     aarch64_atomic_stub_t *_stub;
11031     MacroAssembler *_masm;
11032   public:
11033     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
11034       _masm = masm;
11035       __ align(32);
11036       _entry_point = __ pc();
11037       _stub = stub;
11038     }
11039     ~AtomicStubMark() {
11040       *_stub = (aarch64_atomic_stub_t)_entry_point;
11041     }
11042   };
11043 
11044   // NB: For memory_order_conservative we need a trailing membar after
11045   // LSE atomic operations but not a leading membar.
11046   //
11047   // We don't need a leading membar because a clause in the Arm ARM
11048   // says:
11049   //
11050   //   Barrier-ordered-before
11051   //
11052   //   Barrier instructions order prior Memory effects before subsequent
11053   //   Memory effects generated by the same Observer. A read or a write
11054   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
11055   //   Observer if and only if RW1 appears in program order before RW2
11056   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
11057   //   instruction with both Acquire and Release semantics.
11058   //
11059   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
11060   // and Release semantics, therefore we don't need a leading
11061   // barrier. However, there is no corresponding Barrier-ordered-after
11062   // relationship, therefore we need a trailing membar to prevent a
11063   // later store or load from being reordered with the store in an
11064   // atomic instruction.
11065   //
11066   // This was checked by using the herd7 consistency model simulator
11067   // (http://diy.inria.fr/) with this test case:
11068   //
11069   // AArch64 LseCas
11070   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
11071   // P0 | P1;
11072   // LDR W4, [X2] | MOV W3, #0;
11073   // DMB LD       | MOV W4, #1;
11074   // LDR W3, [X1] | CASAL W3, W4, [X1];
11075   //              | DMB ISH;
11076   //              | STR W4, [X2];
11077   // exists
11078   // (0:X3=0 /\ 0:X4=1)
11079   //
11080   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
11081   // with the store to x in P1. Without the DMB in P1 this may happen.
11082   //
11083   // At the time of writing we don't know of any AArch64 hardware that
11084   // reorders stores in this way, but the Reference Manual permits it.
11085 
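  // In C terms, each CAS stub implements, approximately (ignoring the
  // requested memory ordering):
  //
  //   T cas(T *ptr, T compare_val, T exchange_val) {
  //     T prev = *ptr;
  //     if (prev == compare_val) *ptr = exchange_val;
  //     return prev;   // old value, whether or not the store happened
  //   }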
11086   void gen_cas_entry(Assembler::operand_size size,
11087                      atomic_memory_order order) {
11088     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
11089       exchange_val = c_rarg2;
11090     bool acquire, release;
11091     switch (order) {
11092       case memory_order_relaxed:
11093         acquire = false;
11094         release = false;
11095         break;
11096       case memory_order_release:
11097         acquire = false;
11098         release = true;
11099         break;
11100       default:
11101         acquire = true;
11102         release = true;
11103         break;
11104     }
11105     __ mov(prev, compare_val);
11106     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
11107     if (order == memory_order_conservative) {
11108       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11109     }
11110     if (size == Assembler::xword) {
11111       __ mov(r0, prev);
11112     } else {
11113       __ movw(r0, prev);
11114     }
11115     __ ret(lr);
11116   }
11117 
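  // Atomic fetch-and-add. In C terms, approximately:
  //   { T prev = *addr; *addr = prev + incr; return prev; }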
11118   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
11119     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11120     // If not relaxed, then default to conservative.  Relaxed is the only
11121     // case we use enough to be worth specializing.
11122     if (order == memory_order_relaxed) {
11123       __ ldadd(size, incr, prev, addr);
11124     } else {
11125       __ ldaddal(size, incr, prev, addr);
11126       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11127     }
11128     if (size == Assembler::xword) {
11129       __ mov(r0, prev);
11130     } else {
11131       __ movw(r0, prev);
11132     }
11133     __ ret(lr);
11134   }
11135 
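  // Atomic exchange. In C terms, approximately:
  //   { T prev = *addr; *addr = incr; return prev; }
  // (the argument register is named incr only for symmetry with
  // gen_ldadd_entry; it holds the new value to store)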
11136   void gen_swpal_entry(Assembler::operand_size size) {
11137     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11138     __ swpal(size, incr, prev, addr);
11139     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11140     if (size == Assembler::xword) {
11141       __ mov(r0, prev);
11142     } else {
11143       __ movw(r0, prev);
11144     }
11145     __ ret(lr);
11146   }
11147 
11148   void generate_atomic_entry_points() {
11149     if (!UseLSE) {
11150       return;
11151     }
11152     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
11153     GrowableArray<address> entries;
11154     int entry_count = StubInfo::entry_count(stub_id);
11155     address start = load_archive_data(stub_id, &entries);
11156     if (start != nullptr) {
11157       assert(entries.length() == entry_count - 1,
11158              "unexpected extra entry count %d", entries.length());
11159       aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
11160       int idx = 0;
11161       aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11162       aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11163       aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11164       aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11165       aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11166       aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11167       aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11168       aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11169       aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11170       aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11171       aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11172       aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11173       aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11174       aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11175       aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11176       assert(idx == entries.length(), "sanity!");
11177       return;
11178     }
11179 
11180     __ align(CodeEntryAlignment);
11181     StubCodeMark mark(this, stub_id);
11182     start = __ pc();
11183     address end;
11184     {
11185     // ADD, memory_order_conservative
11186     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
11187     gen_ldadd_entry(Assembler::word, memory_order_conservative);
11188 
11189     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
11190     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
11191 
11192     // ADD, memory_order_relaxed
11193     AtomicStubMark mark_fetch_add_4_relaxed
11194       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
11195     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
11196 
11197     AtomicStubMark mark_fetch_add_8_relaxed
11198       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
11199     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
11200 
11201     // XCHG, memory_order_conservative
11202     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
11203     gen_swpal_entry(Assembler::word);
11204 
11205     AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
11206     gen_swpal_entry(Assembler::xword);
11207 
11208     // CAS, memory_order_conservative
11209     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
11210     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
11211 
11212     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
11213     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
11214 
11215     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
11216     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
11217 
11218     // CAS, memory_order_relaxed
11219     AtomicStubMark mark_cmpxchg_1_relaxed
11220       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
11221     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
11222 
11223     AtomicStubMark mark_cmpxchg_4_relaxed
11224       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
11225     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
11226 
11227     AtomicStubMark mark_cmpxchg_8_relaxed
11228       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
11229     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
11230 
11231     AtomicStubMark mark_cmpxchg_4_release
11232       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
11233     gen_cas_entry(MacroAssembler::word, memory_order_release);
11234 
11235     AtomicStubMark mark_cmpxchg_8_release
11236       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
11237     gen_cas_entry(MacroAssembler::xword, memory_order_release);
11238 
11239     AtomicStubMark mark_cmpxchg_4_seq_cst
11240       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
11241     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
11242 
11243     AtomicStubMark mark_cmpxchg_8_seq_cst
11244       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
11245     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
11246 
11247     end = __ pc();
11248 
11249     ICache::invalidate_range(start, end - start);
11250     // exit block to force update of AtomicStubMark targets
11251     }
11252 
11253     assert(start == (address)aarch64_atomic_fetch_add_4_impl,
11254            "atomic stub should be at start of buffer");
11255     // record the stub start and end plus all the entries saved by the
11256     // AtomicStubMark destructor
11257     entries.append((address)aarch64_atomic_fetch_add_8_impl);
11258     entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
11259     entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
11260     entries.append((address)aarch64_atomic_xchg_4_impl);
11261     entries.append((address)aarch64_atomic_xchg_8_impl);
11262     entries.append((address)aarch64_atomic_cmpxchg_1_impl);
11263     entries.append((address)aarch64_atomic_cmpxchg_4_impl);
11264     entries.append((address)aarch64_atomic_cmpxchg_8_impl);
11265     entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
11266     entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
11267     entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
11268     entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
11269     entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
11270     entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
11271     entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);
11272 
11273     assert(entries.length() == entry_count - 1,
11274            "unexpected extra entry count %d", entries.length());
11275 
11276     store_archive_data(stub_id, start, end, &entries);
11277   }
11278 #endif // LINUX
11279 
11280   address generate_cont_thaw(Continuation::thaw_kind kind) {
11281     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
11282     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
11283 
11284     address start = __ pc();
11285 
11286     if (return_barrier) {
11287       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
11288       __ mov(sp, rscratch1);
11289     }
11290     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11291 
11292     if (return_barrier) {
11293       // preserve possible return value from a method returning to the return barrier
11294       __ fmovd(rscratch1, v0);
11295       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
11296     }
11297 
11298     __ movw(c_rarg1, (return_barrier ? 1 : 0));
11299     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
11300     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
11301 
11302     if (return_barrier) {
11303       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11304       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
11305       __ fmovd(v0, rscratch1);
11306     }
11307     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11308 
11309 
11310     Label thaw_success;
11311     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
11312     __ cbnz(rscratch2, thaw_success);
11313     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
11314     __ br(rscratch1);
11315     __ bind(thaw_success);
11316 
11317     // make room for the thawed frames
11318     __ sub(rscratch1, sp, rscratch2);
11319     __ andr(rscratch1, rscratch1, -16); // align
11320     __ mov(sp, rscratch1);
11321 
11322     if (return_barrier) {
11323       // save original return value -- again
11324       __ fmovd(rscratch1, v0);
11325       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
11326     }
11327 
11328     // If we want, we can templatize thaw by kind, and have three different entries
11329     __ movw(c_rarg1, (uint32_t)kind);
11330 
11331     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
11332     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
11333 
11334     if (return_barrier) {
11335       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11336       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
11337       __ fmovd(v0, rscratch1);
11338     } else {
11339       __ mov(r0, zr); // return 0 (success) from doYield
11340     }
11341 
11342     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
11343     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
11344     __ mov(rfp, sp);
11345 
11346     if (return_barrier_exception) {
11347       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
11348       __ authenticate_return_address(c_rarg1);
11349       __ verify_oop(r0);
11350       // save return value containing the exception oop in callee-saved R19
11351       __ mov(r19, r0);
11352 
11353       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
11354 
11355       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
11356       // __ reinitialize_ptrue();
11357 
11358       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
11359 
11360       __ mov(r1, r0); // the exception handler
11361       __ mov(r0, r19); // restore return value containing the exception oop
11362       __ verify_oop(r0);
11363 
11364       __ leave();
11365       __ mov(r3, lr);
11366       __ br(r1); // the exception handler
11367     } else {
11368       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
11369       __ leave();
11370       __ ret(lr);
11371     }
11372 
11373     return start;
11374   }
11375 
11376   address generate_cont_thaw() {
11377     if (!Continuations::enabled()) return nullptr;
11378 
11379     StubId stub_id = StubId::stubgen_cont_thaw_id;
11380     int entry_count = StubInfo::entry_count(stub_id);
11381     assert(entry_count == 1, "sanity check");
11382     address start = load_archive_data(stub_id);
11383     if (start != nullptr) {
11384       return start;
11385     }
11386     StubCodeMark mark(this, stub_id);
11387     start = __ pc();
11388     generate_cont_thaw(Continuation::thaw_top);
11389 
11390     // record the stub start and end
11391     store_archive_data(stub_id, start, __ pc());
11392 
11393     return start;
11394   }
11395 
11396   address generate_cont_returnBarrier() {
11397     if (!Continuations::enabled()) return nullptr;
11398 
11399     // TODO: will probably need multiple return barriers depending on return type
11400     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
11401     int entry_count = StubInfo::entry_count(stub_id);
11402     assert(entry_count == 1, "sanity check");
11403     address start = load_archive_data(stub_id);
11404     if (start != nullptr) {
11405       return start;
11406     }
11407     StubCodeMark mark(this, stub_id);
11408     start = __ pc();
11409 
11410     generate_cont_thaw(Continuation::thaw_return_barrier);
11411 
11412     // record the stub start and end
11413     store_archive_data(stub_id, start, __ pc());
11414 
11415     return start;
11416   }
11417 
11418   address generate_cont_returnBarrier_exception() {
11419     if (!Continuations::enabled()) return nullptr;
11420 
11421     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
11422     int entry_count = StubInfo::entry_count(stub_id);
11423     assert(entry_count == 1, "sanity check");
11424     address start = load_archive_data(stub_id);
11425     if (start != nullptr) {
11426       return start;
11427     }
11428     StubCodeMark mark(this, stub_id);
11429     start = __ pc();
11430 
11431     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
11432 
11433     // record the stub start and end
11434     store_archive_data(stub_id, start, __ pc());
11435 
11436     return start;
11437   }
11438 
11439   address generate_cont_preempt_stub() {
11440     if (!Continuations::enabled()) return nullptr;
11441     StubId stub_id = StubId::stubgen_cont_preempt_id;
11442     int entry_count = StubInfo::entry_count(stub_id);
11443     assert(entry_count == 1, "sanity check");
11444     address start = load_archive_data(stub_id);
11445     if (start != nullptr) {
11446       return start;
11447     }
11448     StubCodeMark mark(this, stub_id);
11449     start = __ pc();
11450 
11451     __ reset_last_Java_frame(true);
11452 
11453     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
11454     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
11455     __ mov(sp, rscratch2);
11456 
11457     Label preemption_cancelled;
11458     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
11459     __ cbnz(rscratch1, preemption_cancelled);
11460 
11461     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
11462     SharedRuntime::continuation_enter_cleanup(_masm);
11463     __ leave();
11464     __ ret(lr);
11465 
11466     // We acquired the monitor after freezing the frames so call thaw to continue execution.
11467     __ bind(preemption_cancelled);
11468     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
11469     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
11470     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
11471     __ ldr(rscratch1, Address(rscratch1));
11472     __ br(rscratch1);
11473 
11474     // record the stub start and end
11475     store_archive_data(stub_id, start, __ pc());
11476 
11477     return start;
11478   }
11479 
11480   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
11481   // are represented as long[5], with BITS_PER_LIMB = 26.
11482   // Pack five 26-bit limbs into three 64-bit registers.
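  // Limb layout in C terms (a sketch), with l[i] the i-th 26-bit limb:
  //   dest0 = l[0] | l[1] << 26 | l[2] << 52;        // low 64 bits
  //   dest1 = l[2] >> 12 | l[3] << 14 | l[4] << 40;  // middle 64 bits
  //   dest2 = l[4] >> 24;                            // top 2 bits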
11483   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
11484     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
11485     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
11486     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
11487     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
11488 
11489     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
11490     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
11491     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
11492     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
11493 
11494     if (dest2->is_valid()) {
11495       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
11496     } else {
11497 #ifdef ASSERT
11498       Label OK;
11499       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
11500       __ br(__ EQ, OK);
11501       __ stop("high bits of Poly1305 integer should be zero");
11502       __ should_not_reach_here();
11503       __ bind(OK);
11504 #endif
11505     }
11506   }
11507 
11508   // As above, but return only a 128-bit integer, packed into two
11509   // 64-bit registers.
11510   void pack_26(Register dest0, Register dest1, Register src) {
11511     pack_26(dest0, dest1, noreg, src);
11512   }
11513 
11514   // Multiply and multiply-accumulate unsigned 64-bit registers.
11515   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
11516     __ mul(prod_lo, n, m);
11517     __ umulh(prod_hi, n, m);
11518   }
11519   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
11520     wide_mul(rscratch1, rscratch2, n, m);
11521     __ adds(sum_lo, sum_lo, rscratch1);
11522     __ adc(sum_hi, sum_hi, rscratch2);
11523   }
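  // i.e. wide_madd computes (sum_hi:sum_lo) += n * m as a full 128-bit
  // product, propagating the carry from the low half into the high half.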
11524 
11525   // Poly1305, RFC 7539
11526 
11527   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
11528   // description of the tricks used to simplify and accelerate this
11529   // computation.
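  // Per 16-byte message block the update is, approximately,
  //
  //   acc = ((acc + (block | 1 << 128)) * r) % (2^130 - 5)
  //
  // (a sketch of RFC 7539; the "| 1 << 128" shows up below as the
  // add(S_2, S_2, 1) that sets bit 128 of the block sum).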
11530 
11531   address generate_poly1305_processBlocks() {
11532     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
11533     int entry_count = StubInfo::entry_count(stub_id);
11534     assert(entry_count == 1, "sanity check");
11535     address start = load_archive_data(stub_id);
11536     if (start != nullptr) {
11537       return start;
11538     }
11539     __ align(CodeEntryAlignment);
11540     StubCodeMark mark(this, stub_id);
11541     start = __ pc();
11542     Label here;
11543     __ enter();
11544     RegSet callee_saved = RegSet::range(r19, r28);
11545     __ push(callee_saved, sp);
11546 
11547     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
11548 
11549     // Arguments
11550     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
11551 
11552     // R_n is the 128-bit randomly-generated key, packed into two
11553     // registers.  The caller passes this key to us as long[5], with
11554     // BITS_PER_LIMB = 26.
11555     const Register R_0 = *++regs, R_1 = *++regs;
11556     pack_26(R_0, R_1, r_start);
11557 
11558     // RR_n is (R_n >> 2) * 5
11559     const Register RR_0 = *++regs, RR_1 = *++regs;
11560     __ lsr(RR_0, R_0, 2);
11561     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
11562     __ lsr(RR_1, R_1, 2);
11563     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
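    // Product terms at 2^130 and above are folded back using
    // 2^130 == 5 (mod 2^130 - 5): x * 2^130 reduces to 5 * x. That is why
    // RR_n == (R_n >> 2) * 5 here, and why the leftover R_0 & 3 is handled
    // separately in U_2 inside the loop below.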
11564 
11565     // U_n is the current checksum
11566     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
11567     pack_26(U_0, U_1, U_2, acc_start);
11568 
11569     static constexpr int BLOCK_LENGTH = 16;
11570     Label DONE, LOOP;
11571 
11572     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11573     __ br(Assembler::LT, DONE); {
11574       __ bind(LOOP);
11575 
11576       // S_n is to be the sum of U_n and the next block of data
11577       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
11578       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
11579       __ adds(S_0, U_0, S_0);
11580       __ adcs(S_1, U_1, S_1);
11581       __ adc(S_2, U_2, zr);
11582       __ add(S_2, S_2, 1);
11583 
11584       const Register U_0HI = *++regs, U_1HI = *++regs;
11585 
11586       // NB: this logic depends on some of the special properties of
11587       // Poly1305 keys. In particular, because we know that the top
11588       // four bits of R_0 and R_1 are zero, we can add together
11589       // partial products without any risk of needing to propagate a
11590       // carry out.
11591       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
11592       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
11593       __ andr(U_2, R_0, 3);
11594       __ mul(U_2, S_2, U_2);
11595 
11596       // Recycle registers S_0, S_1, S_2
11597       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
11598 
11599       // Partial reduction mod 2^130 - 5
11600       __ adds(U_1, U_0HI, U_1);
11601       __ adc(U_2, U_1HI, U_2);
11602       // Sum now in U_2:U_1:U_0.
11603       // Dead: U_0HI, U_1HI.
11604       regs = (regs.remaining() + U_0HI + U_1HI).begin();
11605 
11606       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
11607 
11608       // First, U_2:U_1:U_0 += (U_2 >> 2)
11609       __ lsr(rscratch1, U_2, 2);
11610       __ andr(U_2, U_2, (u8)3);
11611       __ adds(U_0, U_0, rscratch1);
11612       __ adcs(U_1, U_1, zr);
11613       __ adc(U_2, U_2, zr);
11614       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
11615       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
11616       __ adcs(U_1, U_1, zr);
11617       __ adc(U_2, U_2, zr);
11618 
11619       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
11620       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11621       __ br(~ Assembler::LT, LOOP); // ~LT == GE: loop while length >= BLOCK_LENGTH
11622     }
11623 
11624     // Further reduce modulo 2^130 - 5
11625     __ lsr(rscratch1, U_2, 2);
11626     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
11627     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
11628     __ adcs(U_1, U_1, zr);
11629     __ andr(U_2, U_2, (u1)3);
11630     __ adc(U_2, U_2, zr);
11631 
11632     // Unpack the sum into five 26-bit limbs and write to memory.
11633     __ ubfiz(rscratch1, U_0, 0, 26);
11634     __ ubfx(rscratch2, U_0, 26, 26);
11635     __ stp(rscratch1, rscratch2, Address(acc_start));
11636     __ ubfx(rscratch1, U_0, 52, 12);
11637     __ bfi(rscratch1, U_1, 12, 14);
11638     __ ubfx(rscratch2, U_1, 14, 26);
11639     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
11640     __ ubfx(rscratch1, U_1, 40, 24);
11641     __ bfi(rscratch1, U_2, 24, 3);
11642     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
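    // This is the inverse of pack_26. In C terms (a sketch, M26 == (1 << 26) - 1):
    //   l[0] = U_0 & M26;                        l[1] = U_0 >> 26 & M26;
    //   l[2] = U_0 >> 52 | (U_1 & 0x3fff) << 12; l[3] = U_1 >> 14 & M26;
    //   l[4] = U_1 >> 40 | (U_2 & 7) << 24;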
11643 
11644     __ bind(DONE);
11645     __ pop(callee_saved, sp);
11646     __ leave();
11647     __ ret(lr);
11648 
11649     // record the stub start and end
11650     store_archive_data(stub_id, start, __ pc());
11651 
11652     return start;
11653   }
11654 
11655   // exception handler for upcall stubs
11656   address generate_upcall_stub_exception_handler() {
11657     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
11658     int entry_count = StubInfo::entry_count(stub_id);
11659     assert(entry_count == 1, "sanity check");
11660     address start = load_archive_data(stub_id);
11661     if (start != nullptr) {
11662       return start;
11663     }
11664     StubCodeMark mark(this, stub_id);
11665     start = __ pc();
11666 
11667     // The native caller has no idea how to handle exceptions,
11668     // so we just crash here. It is up to the callee to catch exceptions.
11669     __ verify_oop(r0);
11670     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
11671     __ blr(rscratch1);
11672     __ should_not_reach_here();
11673 
11674     // record the stub start and end
11675     store_archive_data(stub_id, start, __ pc());
11676 
11677     return start;
11678   }
11679 
11680   // load Method* target of MethodHandle
11681   // j_rarg0 = jobject receiver
11682   // rmethod = result
11683   address generate_upcall_stub_load_target() {
11684     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
11685     int entry_count = StubInfo::entry_count(stub_id);
11686     assert(entry_count == 1, "sanity check");
11687     address start = load_archive_data(stub_id);
11688     if (start != nullptr) {
11689       return start;
11690     }
11691     StubCodeMark mark(this, stub_id);
11692     start = __ pc();
11693 
11694     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
11695     // Load target method from receiver
11696     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
11697     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
11698     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
11699     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
11700                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
11701                       noreg, noreg);
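    // In Java terms the chain above is, approximately:
    //   receiver.form.vmentry.method.vmtarget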
11702     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
11703 
11704     __ ret(lr);
11705 
11706     // record the stub start and end
11707     store_archive_data(stub_id, start, __ pc());
11708 
11709     return start;
11710   }
11711 
11712 #undef __
11713 #define __ masm->
11714 
11715   class MontgomeryMultiplyGenerator : public MacroAssembler {
11716 
11717     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
11718       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
11719 
11720     RegSet _toSave;
11721     bool _squaring;
11722 
11723   public:
11724     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
11725       : MacroAssembler(as->code()), _squaring(squaring) {
11726 
11727       // Register allocation
11728 
11729       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
11730       Pa_base = *regs;       // Argument registers
11731       if (squaring)
11732         Pb_base = Pa_base;
11733       else
11734         Pb_base = *++regs;
11735       Pn_base = *++regs;
11736       Rlen = *++regs;
11737       inv = *++regs;
11738       Pm_base = *++regs;
11739 
11740                           // Working registers:
11741       Ra =  *++regs;        // The current digit of a, b, n, and m.
11742       Rb =  *++regs;
11743       Rm =  *++regs;
11744       Rn =  *++regs;
11745 
11746       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
11747       Pb =  *++regs;
11748       Pm =  *++regs;
11749       Pn =  *++regs;
11750 
11751       t0 =  *++regs;        // Three registers which form a
11752       t1 =  *++regs;        // triple-precision accumulator.
11753       t2 =  *++regs;
11754 
11755       Ri =  *++regs;        // Inner and outer loop indexes.
11756       Rj =  *++regs;
11757 
11758       Rhi_ab = *++regs;     // Product registers: low and high parts
11759       Rlo_ab = *++regs;     // of a*b and m*n.
11760       Rhi_mn = *++regs;
11761       Rlo_mn = *++regs;
11762 
11763       // r19 and up are callee-saved.
11764       _toSave = RegSet::range(r19, *regs) + Pm_base;
11765     }
11766 
11767   private:
11768     void save_regs() {
11769       push(_toSave, sp);
11770     }
11771 
11772     void restore_regs() {
11773       pop(_toSave, sp);
11774     }
11775 
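    // unroll_2 runs `block` exactly `count` times: a 2x-unrolled loop, with
    // an odd count branching first to `odd` so one iteration is peeled off
    // before the unrolled pairs.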
11776     template <typename T>
11777     void unroll_2(Register count, T block) {
11778       Label loop, end, odd;
11779       tbnz(count, 0, odd);
11780       cbz(count, end);
11781       align(16);
11782       bind(loop);
11783       (this->*block)();
11784       bind(odd);
11785       (this->*block)();
11786       subs(count, count, 2);
11787       br(Assembler::GT, loop);
11788       bind(end);
11789     }
11790 
11791     template <typename T>
11792     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11793       Label loop, end, odd;
11794       tbnz(count, 0, odd);
11795       cbz(count, end);
11796       align(16);
11797       bind(loop);
11798       (this->*block)(d, s, tmp);
11799       bind(odd);
11800       (this->*block)(d, s, tmp);
11801       subs(count, count, 2);
11802       br(Assembler::GT, loop);
11803       bind(end);
11804     }
11805 
11806     void pre1(RegisterOrConstant i) {
11807       block_comment("pre1");
11808       // Pa = Pa_base;
11809       // Pb = Pb_base + i;
11810       // Pm = Pm_base;
11811       // Pn = Pn_base + i;
11812       // Ra = *Pa;
11813       // Rb = *Pb;
11814       // Rm = *Pm;
11815       // Rn = *Pn;
11816       ldr(Ra, Address(Pa_base));
11817       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11818       ldr(Rm, Address(Pm_base));
11819       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11820       lea(Pa, Address(Pa_base));
11821       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11822       lea(Pm, Address(Pm_base));
11823       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11824 
11825       // Zero the m*n result.
11826       mov(Rhi_mn, zr);
11827       mov(Rlo_mn, zr);
11828     }
11829 
11830     // The core multiply-accumulate step of a Montgomery
11831     // multiplication.  The idea is to schedule operations as a
11832     // pipeline so that instructions with long latencies (loads and
11833     // multiplies) have time to complete before their results are
11834     // used.  This benefits in-order implementations of the
11835     // architecture most, but out-of-order ones gain as well.
11836     void step() {
11837       block_comment("step");
11838       // MACC(Ra, Rb, t0, t1, t2);
11839       // Ra = *++Pa;
11840       // Rb = *--Pb;
11841       umulh(Rhi_ab, Ra, Rb);
11842       mul(Rlo_ab, Ra, Rb);
11843       ldr(Ra, pre(Pa, wordSize));
11844       ldr(Rb, pre(Pb, -wordSize));
11845       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11846                                        // previous iteration.
11847       // MACC(Rm, Rn, t0, t1, t2);
11848       // Rm = *++Pm;
11849       // Rn = *--Pn;
11850       umulh(Rhi_mn, Rm, Rn);
11851       mul(Rlo_mn, Rm, Rn);
11852       ldr(Rm, pre(Pm, wordSize));
11853       ldr(Rn, pre(Pn, -wordSize));
11854       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11855     }
11856 
11857     void post1() {
11858       block_comment("post1");
11859 
11860       // MACC(Ra, Rb, t0, t1, t2);
11861       // Ra = *++Pa;
11862       // Rb = *--Pb;
11863       umulh(Rhi_ab, Ra, Rb);
11864       mul(Rlo_ab, Ra, Rb);
11865       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11866       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11867 
11868       // *Pm = Rm = t0 * inv;
11869       mul(Rm, t0, inv);
11870       str(Rm, Address(Pm));
11871 
11872       // MACC(Rm, Rn, t0, t1, t2);
11873       // t0 = t1; t1 = t2; t2 = 0;
11874       umulh(Rhi_mn, Rm, Rn);
11875 
11876 #ifndef PRODUCT
11877       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11878       {
11879         mul(Rlo_mn, Rm, Rn);
11880         add(Rlo_mn, t0, Rlo_mn);
11881         Label ok;
11882         cbz(Rlo_mn, ok); {
11883           stop("broken Montgomery multiply");
11884         } bind(ok);
11885       }
11886 #endif
11887       // We have very carefully set things up so that
11888       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11889       // the lower half of Rm * Rn because we know the result already:
11890       // it must be -t0.  t0 + (-t0) must generate a carry iff
11891       // t0 != 0.  So, rather than do a mul and an adds we just set
11892       // the carry flag iff t0 is nonzero.
11893       //
11894       // mul(Rlo_mn, Rm, Rn);
11895       // adds(zr, t0, Rlo_mn);
11896       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11897       adcs(t0, t1, Rhi_mn);
11898       adc(t1, t2, zr);
11899       mov(t2, zr);
11900     }
11901 
11902     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11903       block_comment("pre2");
11904       // Pa = Pa_base + i-len;
11905       // Pb = Pb_base + len;
11906       // Pm = Pm_base + i-len;
11907       // Pn = Pn_base + len;
11908 
11909       if (i.is_register()) {
11910         sub(Rj, i.as_register(), len);
11911       } else {
11912         mov(Rj, i.as_constant());
11913         sub(Rj, Rj, len);
11914       }
11915       // Rj == i-len
11916 
11917       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11918       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11919       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11920       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11921 
11922       // Ra = *++Pa;
11923       // Rb = *--Pb;
11924       // Rm = *++Pm;
11925       // Rn = *--Pn;
11926       ldr(Ra, pre(Pa, wordSize));
11927       ldr(Rb, pre(Pb, -wordSize));
11928       ldr(Rm, pre(Pm, wordSize));
11929       ldr(Rn, pre(Pn, -wordSize));
11930 
11931       mov(Rhi_mn, zr);
11932       mov(Rlo_mn, zr);
11933     }
11934 
11935     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11936       block_comment("post2");
11937       if (i.is_constant()) {
11938         mov(Rj, i.as_constant()-len.as_constant());
11939       } else {
11940         sub(Rj, i.as_register(), len);
11941       }
11942 
11943       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11944 
11945       // As soon as we know the least significant digit of our result,
11946       // store it.
11947       // Pm_base[i-len] = t0;
11948       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11949 
11950       // t0 = t1; t1 = t2; t2 = 0;
11951       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11952       adc(t1, t2, zr);
11953       mov(t2, zr);
11954     }
11955 
11956     // A carry in t0 after Montgomery multiplication means that we
11957     // should subtract multiples of n from our result in m.  We'll
11958     // keep doing that until there is no carry.
11959     void normalize(RegisterOrConstant len) {
11960       block_comment("normalize");
11961       // while (t0)
11962       //   t0 = sub(Pm_base, Pn_base, t0, len);
11963       Label loop, post, again;
11964       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11965       cbz(t0, post); {
11966         bind(again); {
11967           mov(i, zr);
11968           mov(cnt, len);
11969           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11970           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11971           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11972           align(16);
11973           bind(loop); {
11974             sbcs(Rm, Rm, Rn);
11975             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11976             add(i, i, 1);
11977             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11978             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11979             sub(cnt, cnt, 1);
11980           } cbnz(cnt, loop);
11981           sbc(t0, t0, zr);
11982         } cbnz(t0, again);
11983       } bind(post);
11984     }
11985 
11986     // Move memory at s to d, reversing words.
11987     //    Increments d to end of copied memory
11988     //    Destroys tmp1, tmp2
11989     //    Preserves len
11990     //    Leaves s pointing to the address which was in d at start
11991     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11992       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11993       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11994 
11995       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11996       mov(tmp1, len);
11997       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11998       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11999     }
12000     // where
12001     void reverse1(Register d, Register s, Register tmp) {
12002       ldr(tmp, pre(s, -wordSize));
12003       ror(tmp, tmp, 32);
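      // the ror by 32 swaps the two 32-bit halves, so each copied longword
      // also has its int pair in reversed order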
12004       str(tmp, post(d, wordSize));
12005     }
12006 
12007     void step_squaring() {
12008       // An extra ACC
12009       step();
12010       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12011     }
12012 
12013     void last_squaring(RegisterOrConstant i) {
12014       Label dont;
12015       // if ((i & 1) == 0) {
12016       tbnz(i.as_register(), 0, dont); {
12017         // MACC(Ra, Rb, t0, t1, t2);
12018         // Ra = *++Pa;
12019         // Rb = *--Pb;
12020         umulh(Rhi_ab, Ra, Rb);
12021         mul(Rlo_ab, Ra, Rb);
12022         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12023       } bind(dont);
12024     }
12025 
12026     void extra_step_squaring() {
12027       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
12028 
12029       // MACC(Rm, Rn, t0, t1, t2);
12030       // Rm = *++Pm;
12031       // Rn = *--Pn;
12032       umulh(Rhi_mn, Rm, Rn);
12033       mul(Rlo_mn, Rm, Rn);
12034       ldr(Rm, pre(Pm, wordSize));
12035       ldr(Rn, pre(Pn, -wordSize));
12036     }
12037 
12038     void post1_squaring() {
12039       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
12040 
12041       // *Pm = Rm = t0 * inv;
12042       mul(Rm, t0, inv);
12043       str(Rm, Address(Pm));
12044 
12045       // MACC(Rm, Rn, t0, t1, t2);
12046       // t0 = t1; t1 = t2; t2 = 0;
12047       umulh(Rhi_mn, Rm, Rn);
12048 
12049 #ifndef PRODUCT
12050       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
12051       {
12052         mul(Rlo_mn, Rm, Rn);
12053         add(Rlo_mn, t0, Rlo_mn);
12054         Label ok;
12055         cbz(Rlo_mn, ok); {
12056           stop("broken Montgomery multiply");
12057         } bind(ok);
12058       }
12059 #endif
12060       // We have very carefully set things up so that
12061       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
12062       // the lower half of Rm * Rn because we know the result already:
12063       // it must be -t0.  t0 + (-t0) must generate a carry iff
12064       // t0 != 0.  So, rather than do a mul and an adds we just set
12065       // the carry flag iff t0 is nonzero.
12066       //
12067       // mul(Rlo_mn, Rm, Rn);
12068       // adds(zr, t0, Rlo_mn);
12069       subs(zr, t0, 1); // Set carry iff t0 is nonzero
12070       adcs(t0, t1, Rhi_mn);
12071       adc(t1, t2, zr);
12072       mov(t2, zr);
12073     }
12074 
12075     void acc(Register Rhi, Register Rlo,
12076              Register t0, Register t1, Register t2) {
12077       adds(t0, t0, Rlo);
12078       adcs(t1, t1, Rhi);
12079       adc(t2, t2, zr);
12080     }
12081 
12082   public:
12083     /**
12084      * Fast Montgomery multiplication.  The derivation of the
12085      * algorithm is in A Cryptographic Library for the Motorola
12086      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
12087      *
12088      * Arguments:
12089      *
12090      * Inputs for multiplication:
12091      *   c_rarg0   - int array elements a
12092      *   c_rarg1   - int array elements b
12093      *   c_rarg2   - int array elements n (the modulus)
12094      *   c_rarg3   - int length
12095      *   c_rarg4   - int inv
12096      *   c_rarg5   - int array elements m (the result)
12097      *
12098      * Inputs for squaring:
12099      *   c_rarg0   - int array elements a
12100      *   c_rarg1   - int array elements n (the modulus)
12101      *   c_rarg2   - int length
12102      *   c_rarg3   - int inv
12103      *   c_rarg4   - int array elements m (the result)
12104      *
12105      */
12106     address generate_multiply() {
12107       Label argh, nothing;
12108 
12109       align(CodeEntryAlignment);
12110       address entry = pc();
12111 
12112       cbzw(Rlen, nothing);
12113 
12114       enter();
12115 
12116       // Make room.
12117       cmpw(Rlen, 512);
12118       br(Assembler::HI, argh);
12119       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12120       andr(sp, Ra, -2 * wordSize);
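            // Note: Rlen is the length in jints here, and we reserve
            // 4 * sizeof(jint) == 16 bytes per element, so the 512-element
            // limit above corresponds to the 8192-byte total allocation
            // checked by the error message at the end of this stub.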
12121 
12122       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
12123 
12124       {
12125         // Copy input args, reversing as we go.  We use Ra as a
12126         // temporary variable.
12127         reverse(Ra, Pa_base, Rlen, t0, t1);
12128         if (!_squaring)
12129           reverse(Ra, Pb_base, Rlen, t0, t1);
12130         reverse(Ra, Pn_base, Rlen, t0, t1);
12131       }
12132 
12133       // Push all call-saved registers and also Pm_base which we'll need
12134       // at the end.
12135       save_regs();
12136 
12137 #ifndef PRODUCT
12138       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
12139       {
12140         ldr(Rn, Address(Pn_base, 0));
12141         mul(Rlo_mn, Rn, inv);
12142         subs(zr, Rlo_mn, -1);
12143         Label ok;
12144         br(EQ, ok); {
12145           stop("broken inverse in Montgomery multiply");
12146         } bind(ok);
12147       }
12148 #endif
12149 
12150       mov(Pm_base, Ra);
12151 
12152       mov(t0, zr);
12153       mov(t1, zr);
12154       mov(t2, zr);
12155 
12156       block_comment("for (int i = 0; i < len; i++) {");
12157       mov(Ri, zr); {
12158         Label loop, end;
12159         cmpw(Ri, Rlen);
12160         br(Assembler::GE, end);
12161 
12162         bind(loop);
12163         pre1(Ri);
12164 
12165         block_comment("  for (j = i; j; j--) {"); {
12166           movw(Rj, Ri);
12167           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12168         } block_comment("  } // j");
12169 
12170         post1();
12171         addw(Ri, Ri, 1);
12172         cmpw(Ri, Rlen);
12173         br(Assembler::LT, loop);
12174         bind(end);
12175         block_comment("} // i");
12176       }
12177 
12178       block_comment("for (int i = len; i < 2*len; i++) {");
12179       mov(Ri, Rlen); {
12180         Label loop, end;
12181         cmpw(Ri, Rlen, Assembler::LSL, 1);
12182         br(Assembler::GE, end);
12183 
12184         bind(loop);
12185         pre2(Ri, Rlen);
12186 
12187         block_comment("  for (j = len*2-i-1; j; j--) {"); {
12188           lslw(Rj, Rlen, 1);
12189           subw(Rj, Rj, Ri);
12190           subw(Rj, Rj, 1);
12191           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12192         } block_comment("  } // j");
12193 
12194         post2(Ri, Rlen);
12195         addw(Ri, Ri, 1);
12196         cmpw(Ri, Rlen, Assembler::LSL, 1);
12197         br(Assembler::LT, loop);
12198         bind(end);
12199       }
12200       block_comment("} // i");
12201 
12202       normalize(Rlen);
12203 
12204       mov(Ra, Pm_base);  // Save Pm_base in Ra
12205       restore_regs();  // Restore caller's Pm_base
12206 
12207       // Copy our result into caller's Pm_base
12208       reverse(Pm_base, Ra, Rlen, t0, t1);
12209 
12210       leave();
12211       bind(nothing);
12212       ret(lr);
12213 
12214       // handler for error case
12215       bind(argh);
12216       stop("MontgomeryMultiply total_allocation must be <= 8192");
12217 
12218       return entry;
12219     }
12220     // In C, approximately:
12221 
12222     // void
12223     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
12224     //                     julong Pn_base[], julong Pm_base[],
12225     //                     julong inv, int len) {
12226     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12227     //   julong *Pa, *Pb, *Pn, *Pm;
12228     //   julong Ra, Rb, Rn, Rm;
12229 
12230     //   int i;
12231 
12232     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12233 
12234     //   for (i = 0; i < len; i++) {
12235     //     int j;
12236 
12237     //     Pa = Pa_base;
12238     //     Pb = Pb_base + i;
12239     //     Pm = Pm_base;
12240     //     Pn = Pn_base + i;
12241 
12242     //     Ra = *Pa;
12243     //     Rb = *Pb;
12244     //     Rm = *Pm;
12245     //     Rn = *Pn;
12246 
12247     //     int iters = i;
12248     //     for (j = 0; iters--; j++) {
12249     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12250     //       MACC(Ra, Rb, t0, t1, t2);
12251     //       Ra = *++Pa;
12252     //       Rb = *--Pb;
12253     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12254     //       MACC(Rm, Rn, t0, t1, t2);
12255     //       Rm = *++Pm;
12256     //       Rn = *--Pn;
12257     //     }
12258 
12259     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
12260     //     MACC(Ra, Rb, t0, t1, t2);
12261     //     *Pm = Rm = t0 * inv;
12262     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12263     //     MACC(Rm, Rn, t0, t1, t2);
12264 
12265     //     assert(t0 == 0, "broken Montgomery multiply");
12266 
12267     //     t0 = t1; t1 = t2; t2 = 0;
12268     //   }
12269 
12270     //   for (i = len; i < 2*len; i++) {
12271     //     int j;
12272 
12273     //     Pa = Pa_base + i-len;
12274     //     Pb = Pb_base + len;
12275     //     Pm = Pm_base + i-len;
12276     //     Pn = Pn_base + len;
12277 
12278     //     Ra = *++Pa;
12279     //     Rb = *--Pb;
12280     //     Rm = *++Pm;
12281     //     Rn = *--Pn;
12282 
12283     //     int iters = len*2-i-1;
12284     //     for (j = i-len+1; iters--; j++) {
12285     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12286     //       MACC(Ra, Rb, t0, t1, t2);
12287     //       Ra = *++Pa;
12288     //       Rb = *--Pb;
12289     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12290     //       MACC(Rm, Rn, t0, t1, t2);
12291     //       Rm = *++Pm;
12292     //       Rn = *--Pn;
12293     //     }
12294 
12295     //     Pm_base[i-len] = t0;
12296     //     t0 = t1; t1 = t2; t2 = 0;
12297     //   }
12298 
12299     //   while (t0)
12300     //     t0 = sub(Pm_base, Pn_base, t0, len);
12301     // }
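          // In the pseudocode above (and in montgomery_square below), MACC
          // adds the double-word product of its first two arguments into the
          // triple-word accumulator {t0, t1, t2}, and MACC2 adds that product
          // twice.  A minimal sketch, assuming an unsigned __int128 type is
          // available (an illustration only, not the macro used elsewhere):
          //
          // static void MACC(julong a, julong b,
          //                  julong &t0, julong &t1, julong &t2) {
          //   unsigned __int128 p = (unsigned __int128)a * b + t0;
          //   t0 = (julong)p;
          //   p  = (p >> 64) + t1;
          //   t1 = (julong)p;
          //   t2 += (julong)(p >> 64);
          // }
          // // MACC2(a, b, t0, t1, t2) does the same with 2 * a * b.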
12302 
12303     /**
12304      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
12305      * multiplies than Montgomery multiplication so it should be up to
12306      * 25% faster.  However, its loop control is more complex and it
12307      * may actually run slower on some machines.
12308      *
12309      * Arguments:
12310      *
12311      * Inputs:
12312      *   c_rarg0   - int array elements a
12313      *   c_rarg1   - int array elements n (the modulus)
12314      *   c_rarg2   - int length
12315      *   c_rarg3   - int inv
12316      *   c_rarg4   - int array elements m (the result)
12317      *
12318      */
12319     address generate_square() {
12320       Label argh;
12321 
12322       align(CodeEntryAlignment);
12323       address entry = pc();
12324 
12325       enter();
12326 
12327       // Make room.
12328       cmpw(Rlen, 512);
12329       br(Assembler::HI, argh);
12330       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12331       andr(sp, Ra, -2 * wordSize);
12332 
12333       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
12334 
12335       {
12336         // Copy input args, reversing as we go.  We use Ra as a
12337         // temporary variable.
12338         reverse(Ra, Pa_base, Rlen, t0, t1);
12339         reverse(Ra, Pn_base, Rlen, t0, t1);
12340       }
12341 
12342       // Push all call-saved registers and also Pm_base which we'll need
12343       // at the end.
12344       save_regs();
12345 
12346       mov(Pm_base, Ra);
12347 
12348       mov(t0, zr);
12349       mov(t1, zr);
12350       mov(t2, zr);
12351 
12352       block_comment("for (int i = 0; i < len; i++) {");
12353       mov(Ri, zr); {
12354         Label loop, end;
12355         bind(loop);
12356         cmp(Ri, Rlen);
12357         br(Assembler::GE, end);
12358 
12359         pre1(Ri);
12360 
12361         block_comment("for (j = (i+1)/2; j; j--) {"); {
12362           add(Rj, Ri, 1);
12363           lsr(Rj, Rj, 1);
12364           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12365         } block_comment("  } // j");
12366 
12367         last_squaring(Ri);
12368 
12369         block_comment("  for (j = i/2; j; j--) {"); {
12370           lsr(Rj, Ri, 1);
12371           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12372         } block_comment("  } // j");
12373 
12374         post1_squaring();
12375         add(Ri, Ri, 1);
12376         cmp(Ri, Rlen);
12377         br(Assembler::LT, loop);
12378 
12379         bind(end);
12380         block_comment("} // i");
12381       }
12382 
12383       block_comment("for (int i = len; i < 2*len; i++) {");
12384       mov(Ri, Rlen); {
12385         Label loop, end;
12386         bind(loop);
12387         cmp(Ri, Rlen, Assembler::LSL, 1);
12388         br(Assembler::GE, end);
12389 
12390         pre2(Ri, Rlen);
12391 
12392         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
12393           lsl(Rj, Rlen, 1);
12394           sub(Rj, Rj, Ri);
12395           sub(Rj, Rj, 1);
12396           lsr(Rj, Rj, 1);
12397           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12398         } block_comment("  } // j");
12399 
12400         last_squaring(Ri);
12401 
12402         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
12403           lsl(Rj, Rlen, 1);
12404           sub(Rj, Rj, Ri);
12405           lsr(Rj, Rj, 1);
12406           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12407         } block_comment("  } // j");
12408 
12409         post2(Ri, Rlen);
12410         add(Ri, Ri, 1);
12411         cmp(Ri, Rlen, Assembler::LSL, 1);
12412 
12413         br(Assembler::LT, loop);
12414         bind(end);
12415         block_comment("} // i");
12416       }
12417 
12418       normalize(Rlen);
12419 
12420       mov(Ra, Pm_base);  // Save Pm_base in Ra
12421       restore_regs();  // Restore caller's Pm_base
12422 
12423       // Copy our result into caller's Pm_base
12424       reverse(Pm_base, Ra, Rlen, t0, t1);
12425 
12426       leave();
12427       ret(lr);
12428 
12429       // handler for error case
12430       bind(argh);
12431       stop("MontgomeryMultiply total_allocation must be <= 8192");
12432 
12433       return entry;
12434     }
12435     // In C, approximately:
12436 
12437     // void
12438     // montgomery_square(julong Pa_base[], julong Pn_base[],
12439     //                   julong Pm_base[], julong inv, int len) {
12440     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12441     //   julong *Pa, *Pb, *Pn, *Pm;
12442     //   julong Ra, Rb, Rn, Rm;
12443 
12444     //   int i;
12445 
12446     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12447 
12448     //   for (i = 0; i < len; i++) {
12449     //     int j;
12450 
12451     //     Pa = Pa_base;
12452     //     Pb = Pa_base + i;
12453     //     Pm = Pm_base;
12454     //     Pn = Pn_base + i;
12455 
12456     //     Ra = *Pa;
12457     //     Rb = *Pb;
12458     //     Rm = *Pm;
12459     //     Rn = *Pn;
12460 
12461     //     int iters = (i+1)/2;
12462     //     for (j = 0; iters--; j++) {
12463     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12464     //       MACC2(Ra, Rb, t0, t1, t2);
12465     //       Ra = *++Pa;
12466     //       Rb = *--Pb;
12467     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12468     //       MACC(Rm, Rn, t0, t1, t2);
12469     //       Rm = *++Pm;
12470     //       Rn = *--Pn;
12471     //     }
12472     //     if ((i & 1) == 0) {
12473     //       assert(Ra == Pa_base[j], "must be");
12474     //       MACC(Ra, Ra, t0, t1, t2);
12475     //     }
12476     //     iters = i/2;
12477     //     assert(iters == i-j, "must be");
12478     //     for (; iters--; j++) {
12479     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12480     //       MACC(Rm, Rn, t0, t1, t2);
12481     //       Rm = *++Pm;
12482     //       Rn = *--Pn;
12483     //     }
12484 
12485     //     *Pm = Rm = t0 * inv;
12486     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12487     //     MACC(Rm, Rn, t0, t1, t2);
12488 
12489     //     assert(t0 == 0, "broken Montgomery multiply");
12490 
12491     //     t0 = t1; t1 = t2; t2 = 0;
12492     //   }
12493 
12494     //   for (i = len; i < 2*len; i++) {
12495     //     int start = i-len+1;
12496     //     int end = start + (len - start)/2;
12497     //     int j;
12498 
12499     //     Pa = Pa_base + i-len;
12500     //     Pb = Pa_base + len;
12501     //     Pm = Pm_base + i-len;
12502     //     Pn = Pn_base + len;
12503 
12504     //     Ra = *++Pa;
12505     //     Rb = *--Pb;
12506     //     Rm = *++Pm;
12507     //     Rn = *--Pn;
12508 
12509     //     int iters = (2*len-i-1)/2;
12510     //     assert(iters == end-start, "must be");
12511     //     for (j = start; iters--; j++) {
12512     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12513     //       MACC2(Ra, Rb, t0, t1, t2);
12514     //       Ra = *++Pa;
12515     //       Rb = *--Pb;
12516     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12517     //       MACC(Rm, Rn, t0, t1, t2);
12518     //       Rm = *++Pm;
12519     //       Rn = *--Pn;
12520     //     }
12521     //     if ((i & 1) == 0) {
12522     //       assert(Ra == Pa_base[j], "must be");
12523     //       MACC(Ra, Ra, t0, t1, t2);
12524     //     }
12525     //     iters = (2*len-i)/2;
12526     //     assert(iters == len-j, "must be");
12527     //     for (; iters--; j++) {
12528     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12529     //       MACC(Rm, Rn, t0, t1, t2);
12530     //       Rm = *++Pm;
12531     //       Rn = *--Pn;
12532     //     }
12533     //     Pm_base[i-len] = t0;
12534     //     t0 = t1; t1 = t2; t2 = 0;
12535     //   }
12536 
12537     //   while (t0)
12538     //     t0 = sub(Pm_base, Pn_base, t0, len);
12539     // }
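          // In both listings above, sub() stands for the final conditional
          // subtraction of the modulus: it subtracts Pn_base from Pm_base
          // word by word with borrow propagation and folds the final borrow
          // into the overflow word t0.  A minimal sketch of a helper with
          // that shape (an assumption for illustration, not the code
          // generated here):
          //
          // static julong sub(julong m[], julong n[], julong t0, int len) {
          //   julong borrow = 0;
          //   for (int i = 0; i < len; i++) {
          //     julong d  = m[i] - n[i];
          //     julong b1 = d > m[i];        // borrow out of m[i] - n[i]
          //     julong d2 = d - borrow;
          //     julong b2 = d2 > d;          // borrow out of the carry-in
          //     m[i]   = d2;
          //     borrow = b1 | b2;
          //   }
          //   return t0 - borrow;
          // }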
12540   };
12541 
12542   // Initialization
12543   void generate_preuniverse_stubs() {
12544     // preuniverse stubs are not needed for aarch64
12545   }
12546 
12547   void generate_initial_stubs() {
12548     // Generate the initial stubs and initialize the entry points
12549 
12550     // entry points that exist on all platforms.  Note: this is code
12551     // that could be shared among different platforms - however, the
12552     // benefit seems to be smaller than the disadvantage of having a
12553     // much more complicated generator structure. See also the comment
12554     // in stubRoutines.hpp.
12555 
12556     StubRoutines::_forward_exception_entry = generate_forward_exception();
12557 
12558     StubRoutines::_call_stub_entry =
12559       generate_call_stub(StubRoutines::_call_stub_return_address);
12560 
12561     // This entry is referenced by megamorphic calls.
12562     StubRoutines::_catch_exception_entry = generate_catch_exception();
12563 
12564     // Initialize table for copy memory (arraycopy) check.
12565     if (UnsafeMemoryAccess::_table == nullptr) {
12566       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
12567     }
12568 
12569     if (UseCRC32Intrinsics) {
12570       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
12571     }
12572 
12573     if (UseCRC32CIntrinsics) {
12574       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
12575     }
12576 
12577     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
12578       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
12579     }
12580 
12581     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
12582       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
12583     }
12584 
12585     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
12586         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
12587       StubRoutines::_hf2f = generate_float16ToFloat();
12588       StubRoutines::_f2hf = generate_floatToFloat16();
12589     }
12590   }
12591 
12592   void generate_continuation_stubs() {
12593     // Continuation stubs:
12594     StubRoutines::_cont_thaw          = generate_cont_thaw();
12595     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
12596     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
12597     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
12598   }
12599 
12600   void generate_final_stubs() {
12601     // support for verify_oop (must happen after universe_init)
12602     if (VerifyOops) {
12603       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
12604     }
12605 
12606     // arraycopy stubs used by compilers
12607     generate_arraycopy_stubs();
12608 
12609     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
12610 
12611     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
12612 
12613     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
12614     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
12615 
12616 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12617 
12618     generate_atomic_entry_points();
12619 
12620 #endif // LINUX && !__ARM_FEATURE_ATOMICS
12621 
12622 #ifdef COMPILER2
12623     if (UseSecondarySupersTable) {
12624       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
12625       if (!InlineSecondarySupersTest) {
12626         generate_lookup_secondary_supers_table_stub();
12627       }
12628     }
12629 #endif
12630 
12631     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
12632       StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
12633     }
12634 
12635     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
12636   }
12637 
12638   void generate_compiler_stubs() {
12639 #if COMPILER2_OR_JVMCI
12640 
12641     if (UseSVE == 0) {
12642       generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
12643     }
12644 
12645     // array equals stub for large arrays.
12646     if (!UseSimpleArrayEquals) {
12647       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
12648     }
12649 
12650     // arrays_hashcode stub for large arrays.
12651     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
12652     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
12653     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
12654     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
12655     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
12656 
12657     // byte_array_inflate stub for large arrays.
12658     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
12659 
12660     // countPositives stub for large arrays.
12661     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
12662 
12663     generate_compare_long_strings();
12664 
12665     generate_string_indexof_stubs();
12666 
12667 #ifdef COMPILER2
12668     if (UseMultiplyToLenIntrinsic) {
12669       StubRoutines::_multiplyToLen = generate_multiplyToLen();
12670     }
12671 
12672     if (UseSquareToLenIntrinsic) {
12673       StubRoutines::_squareToLen = generate_squareToLen();
12674     }
12675 
12676     if (UseMulAddIntrinsic) {
12677       StubRoutines::_mulAdd = generate_mulAdd();
12678     }
12679 
12680     if (UseSIMDForBigIntegerShiftIntrinsics) {
12681       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
12682       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
12683     }
12684 
12685     if (UseMontgomeryMultiplyIntrinsic) {
12686       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
12687       address start = load_archive_data(stub_id);
12688       if (start == nullptr) {
12689         // we have to generate it
12690         StubCodeMark mark(this, stub_id);
12691         MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
12692         start = g.generate_multiply();
12693         // record the stub start and end
12694         store_archive_data(stub_id, start, _masm->pc());
12695       }
12696       StubRoutines::_montgomeryMultiply = start;
12697     }
12698 
12699     if (UseMontgomerySquareIntrinsic) {
12700       StubId stub_id = StubId::stubgen_montgomerySquare_id;
12701       address start = load_archive_data(stub_id);
12702       if (start == nullptr) {
12703         // we have to generate it
12704         StubCodeMark mark(this, stub_id);
12705         MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
12706         // We use generate_multiply() rather than generate_square()
12707         // because it's faster for the sizes of modulus we care about.
12708         start = g.generate_multiply();
12709         // record the stub start and end
12710         store_archive_data(stub_id, start, _masm->pc());
12711       }
12712       StubRoutines::_montgomerySquare = start;
12713     }
12714 
12715 #endif // COMPILER2
12716 
12717     if (UseChaCha20Intrinsics) {
12718       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
12719     }
12720 
12721     if (UseKyberIntrinsics) {
12722       StubRoutines::_kyberNtt = generate_kyberNtt();
12723       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12724       StubRoutines::_kyberNttMult = generate_kyberNttMult();
12725       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12726       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12727       StubRoutines::_kyber12To16 = generate_kyber12To16();
12728       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12729     }
12730 
12731     if (UseDilithiumIntrinsics) {
12732       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12733       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12734       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12735       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12736       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12737     }
12738 
12739     if (UseBASE64Intrinsics) {
12740       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12741       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12742     }
12743 
12744     // data cache line writeback
12745     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12746     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12747 
12748     if (UseAESIntrinsics) {
12749       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12750       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12751       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12752       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12753       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12754     }
12755     if (UseGHASHIntrinsics) {
12756       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
12757       StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
12758       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
12759     }
12760     if (UseAESIntrinsics && UseGHASHIntrinsics) {
12761       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12762     }
12763 
12764     if (UseMD5Intrinsics) {
12765       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12766       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12767     }
12768     if (UseSHA1Intrinsics) {
12769       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12770       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12771     }
12772     if (UseSHA256Intrinsics) {
12773       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12774       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12775     }
12776     if (UseSHA512Intrinsics) {
12777       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12778       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12779     }
12780     if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {
12781       StubRoutines::_double_keccak         = generate_double_keccak();
12782       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12783       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12784     } else if (UseSHA3Intrinsics) {
12785       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12786       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12787     }
12788 
12789     if (UsePoly1305Intrinsics) {
12790       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12791     }
12792 
12793     // generate Adler32 intrinsics code
12794     if (UseAdler32Intrinsics) {
12795       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12796     }
12797 
12798 #endif // COMPILER2_OR_JVMCI
12799   }
12800 
12801  public:
12802   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
12803     switch (blob_id) {
12804     case BlobId::stubgen_preuniverse_id:
12805       generate_preuniverse_stubs();
12806       break;
12807     case BlobId::stubgen_initial_id:
12808       generate_initial_stubs();
12809       break;
12810     case BlobId::stubgen_continuation_id:
12811       generate_continuation_stubs();
12812       break;
12813     case BlobId::stubgen_compiler_id:
12814       generate_compiler_stubs();
12815       break;
12816     case BlobId::stubgen_final_id:
12817       generate_final_stubs();
12818       break;
12819     default:
12820       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12821       break;
12822     }
12823   }
12824 
12825 #if INCLUDE_CDS
12826   static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
12827     // external data defined in this file
12828 #define ADD(addr) external_addresses.append((address)(addr));
12829     ADD(_sha256_round_consts);
12830     ADD(_sha512_round_consts);
12831     ADD(_sha3_round_consts);
12832     ADD(_double_keccak_round_consts);
12833     ADD(_encodeBlock_toBase64);
12834     ADD(_encodeBlock_toBase64URL);
12835     ADD(_decodeBlock_fromBase64ForNoSIMD);
12836     ADD(_decodeBlock_fromBase64URLForNoSIMD);
12837     ADD(_decodeBlock_fromBase64ForSIMD);
12838     ADD(_decodeBlock_fromBase64URLForSIMD);
12839 #undef ADD
12840   }
12841 #endif // INCLUDE_CDS
12842 }; // end class declaration
12843 
12844 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
12845   StubGenerator g(code, blob_id, stub_data);
12846 }
12847 
12848 #if INCLUDE_CDS
12849 void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
12850   StubGenerator::init_AOTAddressTable(addresses);
12851 }
12852 #endif // INCLUDE_CDS
12853 
12854 #if defined (LINUX)
12855 
12856 // Define pointers to atomic stubs and initialize them to point to the
12857 // code in atomic_aarch64.S.
12858 
12859 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
12860   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12861     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
12862   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12863     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
12864 
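      // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands, approximately, to
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;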
12865 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12866 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12867 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12868 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12869 DEFAULT_ATOMIC_OP(xchg, 4, )
12870 DEFAULT_ATOMIC_OP(xchg, 8, )
12871 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12872 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12873 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12874 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12875 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12876 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12877 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12878 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12879 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12880 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12881 
12882 #undef DEFAULT_ATOMIC_OP
12883 
12884 #endif // LINUX