1 /*
    2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Constant data definitions
   83 
   84 static const uint32_t _sha256_round_consts[64] = {
   85   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
   86   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
   87   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
   88   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
   89   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
   90   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
   91   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
   92   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
   93   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
   94   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
   95   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
   96   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
   97   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
   98   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
   99   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  100   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  101 };
  102 
  103 static const uint64_t _sha512_round_consts[80] = {
  104   0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
  105   0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
  106   0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
  107   0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
  108   0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
  109   0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
  110   0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
  111   0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
  112   0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
  113   0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
  114   0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
  115   0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
  116   0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
  117   0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
  118   0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
  119   0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
  120   0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
  121   0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
  122   0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
  123   0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
  124   0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
  125   0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
  126   0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
  127   0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
  128   0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
  129   0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
  130   0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
  131 };
  132 
  133 static const uint64_t _sha3_round_consts[24] = {
  134   0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  135   0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  136   0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  137   0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  138   0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  139   0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  140   0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  141   0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
  142 };
  143 
  144 static const uint64_t _double_keccak_round_consts[24] = {
  145   0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  146   0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  147   0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  148   0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  149   0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  150   0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  151   0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  152   0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
  153 };
  154 
  155 static const char _encodeBlock_toBase64[64] = {
  156   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  157   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  158   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  159   'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  160   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
  161 };
  162 
  163 static const char _encodeBlock_toBase64URL[64] = {
  164   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  165   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  166   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  167   'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  168   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
  169 };
  170 
// Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
// java.util.Base64, except that the trailing character '=' is also treated as an
// illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] = -2,
// while fromBase(URL)64ForNoSIMD['='] = 255 here.
  174 static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
  175   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  176   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  177   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  178   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  179   255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  180   15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
  181   255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  182   41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  183   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  184   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  185   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  186   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  187   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  188   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  189   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  190   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  191 };
  192 
  193 static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
  194   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  195   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  196   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  197   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  198   255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  199   15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
  200   255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  201   41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  202   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  203   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  204   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  205   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  206   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  207   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  208   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  209   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  210 };
  211 
// A legal value of base64 code is in range [0, 127].  We need two lookups
// with tbl/tbx and combine them to get the decoded data. The 1st table vector
// lookup uses tbl; out-of-range indices are set to 0 in the destination. The
// 2nd table vector lookup uses tbx; out-of-range indices are left unchanged in
// the destination. Input [64..126] is mapped to index [65, 127] in the second
// lookup. The value of index 64 is set to 0, so that we know that we already
// got the decoded data with the 1st lookup.
  219 static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
  220   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  221   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  222   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  223   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  224   0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  225   14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  226   255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  227   40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
  228 };
  229 
  230 static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
  231   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  232   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  233   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  234   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  235   0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  236   14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  237   63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  238   40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
  239 };
  240 
  241 
  242 // Stub Code definitions
  243 
  244 class StubGenerator: public StubCodeGenerator {
  245  private:
  246 
  247 #ifdef PRODUCT
  248 #define inc_counter_np(counter) ((void)0)
  249 #else
  250   void inc_counter_np_(uint& counter) {
  251     __ incrementw(ExternalAddress((address)&counter));
  252   }
  253 #define inc_counter_np(counter) \
  254   BLOCK_COMMENT("inc_counter " #counter); \
  255   inc_counter_np_(counter);
  256 #endif
  257 
  258   // Call stubs are used to call Java from C
  259   //
  260   // Arguments:
  261   //    c_rarg0:   call wrapper address                   address
  262   //    c_rarg1:   result                                 address
  263   //    c_rarg2:   result type                            BasicType
  264   //    c_rarg3:   method                                 Method*
  265   //    c_rarg4:   (interpreter) entry point              address
  266   //    c_rarg5:   parameters                             intptr_t*
  267   //    c_rarg6:   parameter size (in words)              int
  268   //    c_rarg7:   thread                                 Thread*
  269   //
  270   // There is no return from the stub itself as any Java result
  271   // is written to result
  272   //
  273   // we save r30 (lr) as the return PC at the base of the frame and
  274   // link r29 (fp) below it as the frame pointer installing sp (r31)
  275   // into fp.
  276   //
  277   // we save r0-r7, which accounts for all the c arguments.
  278   //
  279   // TODO: strictly do we need to save them all? they are treated as
  280   // volatile by C so could we omit saving the ones we are going to
  281   // place in global registers (thread? method?) or those we only use
  282   // during setup of the Java call?
  283   //
  284   // we don't need to save r8 which C uses as an indirect result location
  285   // return register.
  286   //
  287   // we don't need to save r9-r15 which both C and Java treat as
  288   // volatile
  289   //
  290   // we don't need to save r16-18 because Java does not use them
  291   //
  292   // we save r19-r28 which Java uses as scratch registers and C
  293   // expects to be callee-save
  294   //
  295   // we save the bottom 64 bits of each value stored in v8-v15; it is
  296   // the responsibility of the caller to preserve larger values.
  297   //
  298   // so the stub frame looks like this when we enter Java code
  299   //
  300   //     [ return_from_Java     ] <--- sp
  301   //     [ argument word n      ]
  302   //      ...
  303   // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  // -26 [ saved v15            ]
  306   // -25 [ saved v14            ]
  307   // -24 [ saved v13            ]
  308   // -23 [ saved v12            ]
  309   // -22 [ saved v11            ]
  310   // -21 [ saved v10            ]
  311   // -20 [ saved v9             ]
  312   // -19 [ saved v8             ]
  313   // -18 [ saved r28            ]
  314   // -17 [ saved r27            ]
  315   // -16 [ saved r26            ]
  316   // -15 [ saved r25            ]
  317   // -14 [ saved r24            ]
  318   // -13 [ saved r23            ]
  319   // -12 [ saved r22            ]
  320   // -11 [ saved r21            ]
  321   // -10 [ saved r20            ]
  322   //  -9 [ saved r19            ]
  323   //  -8 [ call wrapper    (r0) ]
  324   //  -7 [ result          (r1) ]
  325   //  -6 [ result type     (r2) ]
  326   //  -5 [ method          (r3) ]
  327   //  -4 [ entry point     (r4) ]
  328   //  -3 [ parameters      (r5) ]
  329   //  -2 [ parameter size  (r6) ]
  330   //  -1 [ thread (r7)          ]
  331   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  332   //   1 [ saved lr       (r30) ]
  333 
  334   // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    // frame linkage
    retaddr_off        =   1,
    fp_f               =   0,

    // incoming C arguments; entry_point_off also covers the parameters
    // slot at -3, since c_rarg4/c_rarg5 are stored as a pair
    thread_off         =  -1,
    parameter_size_off =  -2,
    entry_point_off    =  -4,
    method_off         =  -5,
    result_type_off    =  -6,
    result_off         =  -7,
    call_wrapper_off   =  -8,

    // general registers r19-r28, saved in pairs; each offset names the
    // lower slot of its pair
    r20_off            = -10,
    r22_off            = -12,
    r24_off            = -14,
    r26_off            = -16,
    r28_off            = -18,

    // low 64 bits of v8-v15, saved in pairs in the same way
    d9_off             = -20,
    d11_off            = -22,
    d13_off            = -24,
    d15_off            = -26,

    // lowest slot of the save area; it holds the saved FPCR
    sp_after_call_off  = -28,
    fpcr_off           = sp_after_call_off,
  };
  359 
  360   address generate_call_stub(address& return_address) {
  361     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  362            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  363            "adjust this code");
  364 
  365     StubId stub_id = StubId::stubgen_call_stub_id;
  366     GrowableArray<address> entries;
  367     int entry_count = StubInfo::entry_count(stub_id);
  368     assert(entry_count == 2, "sanity check");
  369     address start = load_archive_data(stub_id, &entries);
  370     if (start != nullptr) {
  371       assert(entries.length() == 1, "expected 1 extra entry");
  372       return_address = entries.at(0);
  373       return start;
  374     }
  375     StubCodeMark mark(this, stub_id);
  376     start = __ pc();
  377 
  378     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  379 
  380     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  381     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  382     const Address result        (rfp, result_off         * wordSize);
  383     const Address result_type   (rfp, result_type_off    * wordSize);
  384     const Address method        (rfp, method_off         * wordSize);
  385     const Address entry_point   (rfp, entry_point_off    * wordSize);
  386     const Address parameter_size(rfp, parameter_size_off * wordSize);
  387 
  388     const Address thread        (rfp, thread_off         * wordSize);
  389 
  390     const Address d15_save      (rfp, d15_off * wordSize);
  391     const Address d13_save      (rfp, d13_off * wordSize);
  392     const Address d11_save      (rfp, d11_off * wordSize);
  393     const Address d9_save       (rfp, d9_off * wordSize);
  394 
  395     const Address r28_save      (rfp, r28_off * wordSize);
  396     const Address r26_save      (rfp, r26_off * wordSize);
  397     const Address r24_save      (rfp, r24_off * wordSize);
  398     const Address r22_save      (rfp, r22_off * wordSize);
  399     const Address r20_save      (rfp, r20_off * wordSize);
  400 
  401     // stub code
  402 
  403     address aarch64_entry = __ pc();
  404 
  405     // set up frame and move sp to end of save area
  406     __ enter();
  407     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  408 
  409     // save register parameters and Java scratch/global registers
  410     // n.b. we save thread even though it gets installed in
  411     // rthread because we want to sanity check rthread later
  412     __ str(c_rarg7,  thread);
  413     __ strw(c_rarg6, parameter_size);
  414     __ stp(c_rarg4, c_rarg5,  entry_point);
  415     __ stp(c_rarg2, c_rarg3,  result_type);
  416     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  417 
  418     __ stp(r20, r19,   r20_save);
  419     __ stp(r22, r21,   r22_save);
  420     __ stp(r24, r23,   r24_save);
  421     __ stp(r26, r25,   r26_save);
  422     __ stp(r28, r27,   r28_save);
  423 
  424     __ stpd(v9,  v8,   d9_save);
  425     __ stpd(v11, v10,  d11_save);
  426     __ stpd(v13, v12,  d13_save);
  427     __ stpd(v15, v14,  d15_save);
  428 
  429     __ get_fpcr(rscratch1);
  430     __ str(rscratch1, fpcr_save);
  431     // Set FPCR to the state we need. We do want Round to Nearest. We
  432     // don't want non-IEEE rounding modes or floating-point traps.
  433     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  434     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  435     __ set_fpcr(rscratch1);
  436 
  437     // install Java thread in global register now we have saved
  438     // whatever value it held
  439     __ mov(rthread, c_rarg7);
  440     // And method
  441     __ mov(rmethod, c_rarg3);
  442 
  443     // set up the heapbase register
  444     __ reinit_heapbase();
  445 
  446 #ifdef ASSERT
  447     // make sure we have no pending exceptions
  448     {
  449       Label L;
  450       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  451       __ cmp(rscratch1, (u1)NULL_WORD);
  452       __ br(Assembler::EQ, L);
  453       __ stop("StubRoutines::call_stub: entered with pending exception");
  454       __ BIND(L);
  455     }
  456 #endif
  457     // pass parameters if any
  458     __ mov(esp, sp);
  459     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  460     __ andr(sp, rscratch1, -2 * wordSize);
  461 
  462     BLOCK_COMMENT("pass parameters if any");
  463     Label parameters_done;
  464     // parameter count is still in c_rarg6
  465     // and parameter pointer identifying param 1 is in c_rarg5
  466     __ cbzw(c_rarg6, parameters_done);
  467 
  468     address loop = __ pc();
  469     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  470     __ subsw(c_rarg6, c_rarg6, 1);
  471     __ push(rscratch1);
  472     __ br(Assembler::GT, loop);
  473 
  474     __ BIND(parameters_done);
  475 
  476     // call Java entry -- passing methdoOop, and current sp
  477     //      rmethod: Method*
  478     //      r19_sender_sp: sender sp
  479     BLOCK_COMMENT("call Java function");
  480     __ mov(r19_sender_sp, sp);
  481     __ blr(c_rarg4);
  482 
  483     // we do this here because the notify will already have been done
  484     // if we get to the next instruction via an exception
  485     //
  486     // n.b. adding this instruction here affects the calculation of
  487     // whether or not a routine returns to the call stub (used when
  488     // doing stack walks) since the normal test is to check the return
  489     // pc against the address saved below. so we may need to allow for
  490     // this extra instruction in the check.
  491 
  492     // save current address for use by exception handling code
  493 
  494     return_address = __ pc();
  495     entries.append(return_address);
  496 
  497     // store result depending on type (everything that is not
  498     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  499     // n.b. this assumes Java returns an integral result in r0
  500     // and a floating result in j_farg0
  501     // All of j_rargN may be used to return inline type fields so be careful
  502     // not to clobber those.
  503     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
  504     // assignment of Rresult below.
  505     Register Rresult = r14, Rresult_type = r15;
  506     __ ldr(Rresult, result);
  507     Label is_long, is_float, is_double, check_prim, exit;
  508     __ ldr(Rresult_type, result_type);
  509     __ cmp(Rresult_type, (u1)T_OBJECT);
  510     __ br(Assembler::EQ, check_prim);
  511     __ cmp(Rresult_type, (u1)T_LONG);
  512     __ br(Assembler::EQ, is_long);
  513     __ cmp(Rresult_type, (u1)T_FLOAT);
  514     __ br(Assembler::EQ, is_float);
  515     __ cmp(Rresult_type, (u1)T_DOUBLE);
  516     __ br(Assembler::EQ, is_double);
  517 
  518     // handle T_INT case
  519     __ strw(r0, Address(Rresult));
  520 
  521     __ BIND(exit);
  522 
  523     // pop parameters
  524     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  525 
  526 #ifdef ASSERT
  527     // verify that threads correspond
  528     {
  529       Label L, S;
  530       __ ldr(rscratch1, thread);
  531       __ cmp(rthread, rscratch1);
  532       __ br(Assembler::NE, S);
  533       __ get_thread(rscratch1);
  534       __ cmp(rthread, rscratch1);
  535       __ br(Assembler::EQ, L);
  536       __ BIND(S);
  537       __ stop("StubRoutines::call_stub: threads must correspond");
  538       __ BIND(L);
  539     }
  540 #endif
  541 
  542     __ pop_cont_fastpath(rthread);
  543 
  544     // restore callee-save registers
  545     __ ldpd(v15, v14,  d15_save);
  546     __ ldpd(v13, v12,  d13_save);
  547     __ ldpd(v11, v10,  d11_save);
  548     __ ldpd(v9,  v8,   d9_save);
  549 
  550     __ ldp(r28, r27,   r28_save);
  551     __ ldp(r26, r25,   r26_save);
  552     __ ldp(r24, r23,   r24_save);
  553     __ ldp(r22, r21,   r22_save);
  554     __ ldp(r20, r19,   r20_save);
  555 
  556     // restore fpcr
  557     __ ldr(rscratch1,  fpcr_save);
  558     __ set_fpcr(rscratch1);
  559 
  560     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  561     __ ldrw(c_rarg2, result_type);
  562     __ ldr(c_rarg3,  method);
  563     __ ldp(c_rarg4, c_rarg5,  entry_point);
  564     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  565 
  566     // leave frame and return to caller
  567     __ leave();
  568     __ ret(lr);
  569 
  570     // handle return types different from T_INT
  571     __ BIND(check_prim);
  572     if (InlineTypeReturnedAsFields) {
  573       // Check for scalarized return value
  574       __ tbz(r0, 0, is_long);
  575       // Load pack handler address
  576       __ andr(rscratch1, r0, -2);
  577       __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
  578       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
  579       __ blr(rscratch1);
  580       __ b(exit);
  581     }
  582 
  583     __ BIND(is_long);
  584     __ str(r0, Address(Rresult, 0));
  585     __ br(Assembler::AL, exit);
  586 
  587     __ BIND(is_float);
  588     __ strs(j_farg0, Address(Rresult, 0));
  589     __ br(Assembler::AL, exit);
  590 
  591     __ BIND(is_double);
  592     __ strd(j_farg0, Address(Rresult, 0));
  593     __ br(Assembler::AL, exit);
  594 
  595     // record the stub entry and end plus the auxiliary entry
  596     store_archive_data(stub_id, start, __ pc(), &entries);
  597 
  598     return start;
  599   }
  600 
  601   // Return point for a Java call if there's an exception thrown in
  602   // Java code.  The exception is caught and transformed into a
  603   // pending exception stored in JavaThread that can be tested from
  604   // within the VM.
  605   //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  610   //
  611   // r0: exception oop
  612 
  613   address generate_catch_exception() {
  614     StubId stub_id = StubId::stubgen_catch_exception_id;
  615     int entry_count = StubInfo::entry_count(stub_id);
  616     assert(entry_count == 1, "sanity check");
  617     address start = load_archive_data(stub_id);
  618     if (start != nullptr) {
  619       return start;
  620     }
  621     StubCodeMark mark(this, stub_id);
  622     start = __ pc();
  623 
  624     // same as in generate_call_stub():
  625     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  626     const Address thread        (rfp, thread_off         * wordSize);
  627 
  628 #ifdef ASSERT
  629     // verify that threads correspond
  630     {
  631       Label L, S;
  632       __ ldr(rscratch1, thread);
  633       __ cmp(rthread, rscratch1);
  634       __ br(Assembler::NE, S);
  635       __ get_thread(rscratch1);
  636       __ cmp(rthread, rscratch1);
  637       __ br(Assembler::EQ, L);
  638       __ bind(S);
  639       __ stop("StubRoutines::catch_exception: threads must correspond");
  640       __ bind(L);
  641     }
  642 #endif
  643 
  644     // set pending exception
  645     __ verify_oop(r0);
  646 
  647     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  648     // special case -- add file name string to AOT address table
  649     address file = (address)AOTCodeCache::add_C_string(__FILE__);
  650     __ lea(rscratch1, ExternalAddress(file));
  651     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  652     __ movw(rscratch1, (int)__LINE__);
  653     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  654 
  655     // complete return to VM
  656     assert(StubRoutines::_call_stub_return_address != nullptr,
  657            "_call_stub_return_address must have been generated before");
  658     __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));
  659 
  660     // record the stub entry and end
  661     store_archive_data(stub_id, start, __ pc());
  662 
  663     return start;
  664   }
  665 
  666   // Continuation point for runtime calls returning with a pending
  667   // exception.  The pending exception check happened in the runtime
  668   // or native call stub.  The pending exception in Thread is
  669   // converted into a Java-level exception.
  670   //
  671   // Contract with Java-level exception handlers:
  672   // r0: exception
  673   // r3: throwing pc
  674   //
  675   // NOTE: At entry of this stub, exception-pc must be in LR !!
  676 
  677   // NOTE: this is always used as a jump target within generated code
  678   // so it just needs to be generated code with no x86 prolog
  679 
  680   address generate_forward_exception() {
  681     StubId stub_id = StubId::stubgen_forward_exception_id;
  682     int entry_count = StubInfo::entry_count(stub_id);
  683     assert(entry_count == 1, "sanity check");
  684     address start = load_archive_data(stub_id);
  685     if (start != nullptr) {
  686       return start;
  687     }
  688     StubCodeMark mark(this, stub_id);
  689     start = __ pc();
  690 
  691     // Upon entry, LR points to the return address returning into
  692     // Java (interpreted or compiled) code; i.e., the return address
  693     // becomes the throwing pc.
  694     //
  695     // Arguments pushed before the runtime call are still on the stack
  696     // but the exception handler will reset the stack pointer ->
  697     // ignore them.  A potential result in registers can be ignored as
  698     // well.
  699 
  700 #ifdef ASSERT
  701     // make sure this code is only executed if there is a pending exception
  702     {
  703       Label L;
  704       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  705       __ cbnz(rscratch1, L);
  706       __ stop("StubRoutines::forward exception: no pending exception (1)");
  707       __ bind(L);
  708     }
  709 #endif
  710 
  711     // compute exception handler into r19
  712 
  713     // call the VM to find the handler address associated with the
  714     // caller address. pass thread in r0 and caller pc (ret address)
  715     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  716     // the stack.
  717     __ mov(c_rarg1, lr);
  718     // lr will be trashed by the VM call so we move it to R19
  719     // (callee-saved) because we also need to pass it to the handler
  720     // returned by this call.
  721     __ mov(r19, lr);
  722     BLOCK_COMMENT("call exception_handler_for_return_address");
  723     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  724                          SharedRuntime::exception_handler_for_return_address),
  725                     rthread, c_rarg1);
  726     // Reinitialize the ptrue predicate register, in case the external runtime
  727     // call clobbers ptrue reg, as we may return to SVE compiled code.
  728     __ reinitialize_ptrue();
  729 
  730     // we should not really care that lr is no longer the callee
  731     // address. we saved the value the handler needs in r19 so we can
  732     // just copy it to r3. however, the C2 handler will push its own
  733     // frame and then calls into the VM and the VM code asserts that
  734     // the PC for the frame above the handler belongs to a compiled
  735     // Java method. So, we restore lr here to satisfy that assert.
  736     __ mov(lr, r19);
  737     // setup r0 & r3 & clear pending exception
  738     __ mov(r3, r19);
  739     __ mov(r19, r0);
  740     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  741     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  742 
  743 #ifdef ASSERT
  744     // make sure exception is set
  745     {
  746       Label L;
  747       __ cbnz(r0, L);
  748       __ stop("StubRoutines::forward exception: no pending exception (2)");
  749       __ bind(L);
  750     }
  751 #endif
  752 
  753     // continue at exception handler
  754     // r0: exception
  755     // r3: throwing pc
  756     // r19: exception handler
  757     __ verify_oop(r0);
  758     __ br(r19);
  759 
  760     // record the stub entry and end
  761     store_archive_data(stub_id, start, __ pc());
  762 
  763     return start;
  764   }
  765 
  766   // Non-destructive plausibility checks for oops
  767   //
  768   // Arguments:
  769   //    r0: oop to verify
  770   //    rscratch1: error message
  771   //
  772   // Stack after saving c_rarg3:
  773   //    [tos + 0]: saved c_rarg3
  774   //    [tos + 1]: saved c_rarg2
  775   //    [tos + 2]: saved lr
  776   //    [tos + 3]: saved rscratch2
  777   //    [tos + 4]: saved r0
  778   //    [tos + 5]: saved rscratch1
  779   address generate_verify_oop() {
  780     StubId stub_id = StubId::stubgen_verify_oop_id;
  781     int entry_count = StubInfo::entry_count(stub_id);
  782     assert(entry_count == 1, "sanity check");
  783     address start = load_archive_data(stub_id);
  784     if (start != nullptr) {
  785       return start;
  786     }
  787     StubCodeMark mark(this, stub_id);
  788     start = __ pc();
  789 
  790     Label exit, error;
  791 
  792     // save c_rarg2 and c_rarg3
  793     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  794 
  795     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  796     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  797     __ ldr(c_rarg3, Address(c_rarg2));
  798     __ add(c_rarg3, c_rarg3, 1);
  799     __ str(c_rarg3, Address(c_rarg2));
  800 
  801     // object is in r0
  802     // make sure object is 'reasonable'
  803     __ cbz(r0, exit); // if obj is null it is OK
  804 
  805     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  806     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  807 
  808     // return if everything seems ok
  809     __ bind(exit);
  810 
  811     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  812     __ ret(lr);
  813 
  814     // handle errors
  815     __ bind(error);
  816     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  817 
  818     __ push(RegSet::range(r0, r29), sp);
  819     // debug(char* msg, int64_t pc, int64_t regs[])
  820     __ mov(c_rarg0, rscratch1);      // pass address of error message
  821     __ mov(c_rarg1, lr);             // pass return address
  822     __ mov(c_rarg2, sp);             // pass address of regs on stack
  823 #ifndef PRODUCT
  824     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  825 #endif
  826     BLOCK_COMMENT("call MacroAssembler::debug");
  827     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  828     __ blr(rscratch1);
  829     __ hlt(0);
  830 
  831     // record the stub entry and end
  832     store_archive_data(stub_id, start, __ pc());
  833 
  834     return start;
  835   }
  836 
  837   // Generate indices for iota vector.
  838   void generate_iota_indices(StubId stub_id) {
  839     GrowableArray<address> entries;
  840     int entry_count = StubInfo::entry_count(stub_id);
  841     assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
  842     address start = load_archive_data(stub_id, &entries);
  843     if (start != nullptr) {
  844       assert(entries.length() == entry_count - 1,
  845              "unexpected entries count %d", entries.length());
  846       StubRoutines::aarch64::_vector_iota_indices[0] = start;
  847       for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
  848         StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
  849       }
  850       return;
  851     }
  852     __ align(CodeEntryAlignment);
  853     StubCodeMark mark(this, stub_id);
  854     start = __ pc();
  855     // B
  856     __ emit_data64(0x0706050403020100, relocInfo::none);
  857     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  858     entries.append(__ pc());
  859     // H
  860     __ emit_data64(0x0003000200010000, relocInfo::none);
  861     __ emit_data64(0x0007000600050004, relocInfo::none);
  862     entries.append(__ pc());
  863     // S
  864     __ emit_data64(0x0000000100000000, relocInfo::none);
  865     __ emit_data64(0x0000000300000002, relocInfo::none);
  866     entries.append(__ pc());
  867     // D
  868     __ emit_data64(0x0000000000000000, relocInfo::none);
  869     __ emit_data64(0x0000000000000001, relocInfo::none);
  870     entries.append(__ pc());
  871     // S - FP
  872     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  873     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  874     entries.append(__ pc());
  875     // D - FP
  876     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  877     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  878 
  879     // record the stub entry and end
  880     store_archive_data(stub_id, start, __ pc(), &entries);
  881 
  882     // install the entry addresses in the entry array
  883     assert(entries.length() == entry_count - 1,
  884            "unexpected entries count %d", entries.length());
  885     StubRoutines::aarch64::_vector_iota_indices[0] = start;
  886     for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
  887       StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
  888     }
  889   }
  890 
  891   // The inner part of zero_words().  This is the bulk operation,
  892   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  893   // caller is responsible for zeroing the last few words.
  894   //
  895   // Inputs:
  896   // r10: the HeapWord-aligned base address of an array to zero.
  897   // r11: the count in HeapWords, r11 > 0.
  898   //
  899   // Returns r10 and r11, adjusted for the caller to clear.
  900   // r10: the base address of the tail of words left to clear.
  901   // r11: the number of words in the tail.
  902   //      r11 < MacroAssembler::zero_words_block_size.
  903 
  904   address generate_zero_blocks() {
  905     StubId stub_id = StubId::stubgen_zero_blocks_id;
  906     int entry_count = StubInfo::entry_count(stub_id);
  907     assert(entry_count == 1, "sanity check");
  908     address start = load_archive_data(stub_id);
  909     if (start != nullptr) {
  910       return start;
  911     }
  912     __ align(CodeEntryAlignment);
  913     StubCodeMark mark(this, stub_id);
  914     Label done;
  915     Label base_aligned;
  916 
  917     Register base = r10, cnt = r11;
  918 
  919     start = __ pc();
  920 
  921     if (UseBlockZeroing) {
  922       int zva_length = VM_Version::zva_length();
  923 
  924       // Ensure ZVA length can be divided by 16. This is required by
  925       // the subsequent operations.
  926       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  927 
  928       __ tbz(base, 3, base_aligned);
  929       __ str(zr, Address(__ post(base, 8)));
  930       __ sub(cnt, cnt, 1);
  931       __ bind(base_aligned);
  932 
  933       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  934       // alignment.
  935       Label small;
  936       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
  937       __ subs(rscratch1, cnt, low_limit >> 3);
  938       __ br(Assembler::LT, small);
  939       __ zero_dcache_blocks(base, cnt);
  940       __ bind(small);
  941     }
  942 
  943     {
  944       // Number of stp instructions we'll unroll
  945       const int unroll =
  946         MacroAssembler::zero_words_block_size / 2;
  947       // Clear the remaining blocks.
  948       Label loop;
  949       __ subs(cnt, cnt, unroll * 2);
  950       __ br(Assembler::LT, done);
  951       __ bind(loop);
  952       for (int i = 0; i < unroll; i++)
  953         __ stp(zr, zr, __ post(base, 16));
  954       __ subs(cnt, cnt, unroll * 2);
  955       __ br(Assembler::GE, loop);
  956       __ bind(done);
  957       __ add(cnt, cnt, unroll * 2);
  958     }
  959 
  960     __ ret(lr);
  961 
  962     // record the stub entry and end
  963     store_archive_data(stub_id, start, __ pc());
  964 
  965     return start;
  966   }
  967 
  968 
  // Direction of a block copy. The enumerator values are +1 and -1 so
  // that a direction can be used as a sign when scaling offsets.
  enum copy_direction {
    copy_forwards  =  1,
    copy_backwards = -1
  };
  973 
  974   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  975   // for arraycopy stubs.
  976   class ArrayCopyBarrierSetHelper : StackObj {
  977     BarrierSetAssembler* _bs_asm;
  978     MacroAssembler* _masm;
  979     DecoratorSet _decorators;
  980     BasicType _type;
  981     Register _gct1;
  982     Register _gct2;
  983     Register _gct3;
  984     FloatRegister _gcvt1;
  985     FloatRegister _gcvt2;
  986     FloatRegister _gcvt3;
  987 
  988   public:
  989     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  990                               DecoratorSet decorators,
  991                               BasicType type,
  992                               Register gct1,
  993                               Register gct2,
  994                               Register gct3,
  995                               FloatRegister gcvt1,
  996                               FloatRegister gcvt2,
  997                               FloatRegister gcvt3)
  998       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  999         _masm(masm),
 1000         _decorators(decorators),
 1001         _type(type),
 1002         _gct1(gct1),
 1003         _gct2(gct2),
 1004         _gct3(gct3),
 1005         _gcvt1(gcvt1),
 1006         _gcvt2(gcvt2),
 1007         _gcvt3(gcvt3) {
 1008     }
 1009 
 1010     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 1011       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 1012                             dst1, dst2, src,
 1013                             _gct1, _gct2, _gcvt1);
 1014     }
 1015 
 1016     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 1017       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 1018                              dst, src1, src2,
 1019                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 1020     }
 1021 
 1022     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 1023       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 1024                             dst1, dst2, src,
 1025                             _gct1);
 1026     }
 1027 
 1028     void copy_store_at_16(Address dst, Register src1, Register src2) {
 1029       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 1030                              dst, src1, src2,
 1031                              _gct1, _gct2, _gct3);
 1032     }
 1033 
 1034     void copy_load_at_8(Register dst, Address src) {
 1035       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 1036                             dst, noreg, src,
 1037                             _gct1);
 1038     }
 1039 
 1040     void copy_store_at_8(Address dst, Register src) {
 1041       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 1042                              dst, src, noreg,
 1043                              _gct1, _gct2, _gct3);
 1044     }
 1045   };
 1046 
 1047   // Bulk copy of blocks of 8 words.
 1048   //
 1049   // count is a count of words.
 1050   //
 1051   // Precondition: count >= 8
 1052   //
 1053   // Postconditions:
 1054   //
 1055   // The least significant bit of count contains the remaining count
 1056   // of words to copy.  The rest of count is trash.
 1057   //
 1058   // s and d are adjusted to point to the remaining words to copy
 1059   //
 1060   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
 1061     int entry_count = StubInfo::entry_count(stub_id);
 1062     assert(entry_count == 1, "sanity check");
 1063     address start = load_archive_data(stub_id);
 1064     if (start != nullptr) {
 1065       return start;
 1066     }
 1067     BasicType type;
 1068     copy_direction direction;
 1069 
 1070     switch (stub_id) {
 1071     case StubId::stubgen_copy_byte_f_id:
 1072       direction = copy_forwards;
 1073       type = T_BYTE;
 1074       break;
 1075     case StubId::stubgen_copy_byte_b_id:
 1076       direction = copy_backwards;
 1077       type = T_BYTE;
 1078       break;
 1079     case StubId::stubgen_copy_oop_f_id:
 1080       direction = copy_forwards;
 1081       type = T_OBJECT;
 1082       break;
 1083     case StubId::stubgen_copy_oop_b_id:
 1084       direction = copy_backwards;
 1085       type = T_OBJECT;
 1086       break;
 1087     case StubId::stubgen_copy_oop_uninit_f_id:
 1088       direction = copy_forwards;
 1089       type = T_OBJECT;
 1090       break;
 1091     case StubId::stubgen_copy_oop_uninit_b_id:
 1092       direction = copy_backwards;
 1093       type = T_OBJECT;
 1094       break;
 1095     default:
 1096       ShouldNotReachHere();
 1097     }
 1098 
 1099     int unit = wordSize * direction;
 1100     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 1101 
 1102     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 1103       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 1104     const Register stride = r14;
 1105     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1106     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1107     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1108 
 1109     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 1110     assert_different_registers(s, d, count, rscratch1, rscratch2);
 1111 
 1112     Label again, drain;
 1113 
 1114     __ align(CodeEntryAlignment);
 1115 
 1116     StubCodeMark mark(this, stub_id);
 1117 
 1118     start = __ pc();
 1119 
 1120     Label unaligned_copy_long;
 1121     if (AvoidUnalignedAccesses) {
 1122       __ tbnz(d, 3, unaligned_copy_long);
 1123     }
 1124 
 1125     if (direction == copy_forwards) {
 1126       __ sub(s, s, bias);
 1127       __ sub(d, d, bias);
 1128     }
 1129 
 1130 #ifdef ASSERT
 1131     // Make sure we are never given < 8 words
 1132     {
 1133       Label L;
 1134       __ cmp(count, (u1)8);
 1135       __ br(Assembler::GE, L);
 1136       __ stop("genrate_copy_longs called with < 8 words");
 1137       __ bind(L);
 1138     }
 1139 #endif
 1140 
 1141     // Fill 8 registers
 1142     if (UseSIMDForMemoryOps) {
 1143       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 1144       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 1145     } else {
 1146       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1147       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1148       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1149       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1150     }
 1151 
 1152     __ subs(count, count, 16);
 1153     __ br(Assembler::LO, drain);
 1154 
 1155     int prefetch = PrefetchCopyIntervalInBytes;
 1156     bool use_stride = false;
 1157     if (direction == copy_backwards) {
 1158       use_stride = prefetch > 256;
 1159       prefetch = -prefetch;
 1160       if (use_stride) __ mov(stride, prefetch);
 1161     }
 1162 
 1163     __ bind(again);
 1164 
 1165     if (PrefetchCopyIntervalInBytes > 0)
 1166       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1167 
 1168     if (UseSIMDForMemoryOps) {
 1169       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 1170       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 1171       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 1172       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 1173     } else {
 1174       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1175       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1176       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 1177       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1178       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 1179       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1180       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 1181       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1182     }
 1183 
 1184     __ subs(count, count, 8);
 1185     __ br(Assembler::HS, again);
 1186 
 1187     // Drain
 1188     __ bind(drain);
 1189     if (UseSIMDForMemoryOps) {
 1190       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 1191       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 1192     } else {
 1193       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1194       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 1195       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 1196       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 1197     }
 1198 
 1199     {
 1200       Label L1, L2;
 1201       __ tbz(count, exact_log2(4), L1);
 1202       if (UseSIMDForMemoryOps) {
 1203         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 1204         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 1205       } else {
 1206         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1207         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1208         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1209         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 1210       }
 1211       __ bind(L1);
 1212 
 1213       if (direction == copy_forwards) {
 1214         __ add(s, s, bias);
 1215         __ add(d, d, bias);
 1216       }
 1217 
 1218       __ tbz(count, 1, L2);
 1219       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 1220       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 1221       __ bind(L2);
 1222     }
 1223 
 1224     __ ret(lr);
 1225 
 1226     if (AvoidUnalignedAccesses) {
 1227       Label drain, again;
 1228       // Register order for storing. Order is different for backward copy.
 1229 
 1230       __ bind(unaligned_copy_long);
 1231 
 1232       // source address is even aligned, target odd aligned
 1233       //
 1234       // when forward copying word pairs we read long pairs at offsets
 1235       // {0, 2, 4, 6} (in long words). when backwards copying we read
 1236       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 1237       // address by -2 in the forwards case so we can compute the
 1238       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 1239       // or -1.
 1240       //
 1241       // when forward copying we need to store 1 word, 3 pairs and
 1242       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 1243       // zero offset We adjust the destination by -1 which means we
 1244       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 1245       //
 1246       // When backwards copyng we need to store 1 word, 3 pairs and
 1247       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1248       // offsets {1, 3, 5, 7, 8} * unit.
 1249 
 1250       if (direction == copy_forwards) {
 1251         __ sub(s, s, 16);
 1252         __ sub(d, d, 8);
 1253       }
 1254 
 1255       // Fill 8 registers
 1256       //
 1257       // for forwards copy s was offset by -16 from the original input
 1258       // value of s so the register contents are at these offsets
 1259       // relative to the 64 bit block addressed by that original input
 1260       // and so on for each successive 64 byte block when s is updated
 1261       //
 1262       // t0 at offset 0,  t1 at offset 8
 1263       // t2 at offset 16, t3 at offset 24
 1264       // t4 at offset 32, t5 at offset 40
 1265       // t6 at offset 48, t7 at offset 56
 1266 
 1267       // for backwards copy s was not offset so the register contents
 1268       // are at these offsets into the preceding 64 byte block
 1269       // relative to that original input and so on for each successive
 1270       // preceding 64 byte block when s is updated. this explains the
 1271       // slightly counter-intuitive looking pattern of register usage
 1272       // in the stp instructions for backwards copy.
 1273       //
 1274       // t0 at offset -16, t1 at offset -8
 1275       // t2 at offset -32, t3 at offset -24
 1276       // t4 at offset -48, t5 at offset -40
 1277       // t6 at offset -64, t7 at offset -56
 1278 
 1279       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1280       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1281       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1282       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1283 
 1284       __ subs(count, count, 16);
 1285       __ br(Assembler::LO, drain);
 1286 
 1287       int prefetch = PrefetchCopyIntervalInBytes;
 1288       bool use_stride = false;
 1289       if (direction == copy_backwards) {
 1290         use_stride = prefetch > 256;
 1291         prefetch = -prefetch;
 1292         if (use_stride) __ mov(stride, prefetch);
 1293       }
 1294 
 1295       __ bind(again);
 1296 
 1297       if (PrefetchCopyIntervalInBytes > 0)
 1298         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1299 
 1300       if (direction == copy_forwards) {
 1301         // allowing for the offset of -8 the store instructions place
 1302         // registers into the target 64 bit block at the following
 1303         // offsets
 1304         //
 1305         // t0 at offset 0
 1306         // t1 at offset 8,  t2 at offset 16
 1307         // t3 at offset 24, t4 at offset 32
 1308         // t5 at offset 40, t6 at offset 48
 1309         // t7 at offset 56
 1310 
 1311         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1312         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1313         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1314         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1315         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1316         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1317         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1318         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1319         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1320       } else {
 1321         // d was not offset when we started so the registers are
 1322         // written into the 64 bit block preceding d with the following
 1323         // offsets
 1324         //
 1325         // t1 at offset -8
 1326         // t3 at offset -24, t0 at offset -16
 1327         // t5 at offset -48, t2 at offset -32
 1328         // t7 at offset -56, t4 at offset -48
 1329         //                   t6 at offset -64
 1330         //
 1331         // note that this matches the offsets previously noted for the
 1332         // loads
 1333 
 1334         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1335         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1336         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1337         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1338         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1339         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1340         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1341         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1342         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1343       }
 1344 
 1345       __ subs(count, count, 8);
 1346       __ br(Assembler::HS, again);
 1347 
 1348       // Drain
 1349       //
 1350       // this uses the same pattern of offsets and register arguments
 1351       // as above
 1352       __ bind(drain);
 1353       if (direction == copy_forwards) {
 1354         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1355         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1356         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1357         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1358         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1359       } else {
 1360         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1361         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1362         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1363         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1364         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1365       }
 1366       // now we need to copy any remaining part block which may
 1367       // include a 4 word block subblock and/or a 2 word subblock.
 1368       // bits 2 and 1 in the count are the tell-tale for whether we
 1369       // have each such subblock
 1370       {
 1371         Label L1, L2;
 1372         __ tbz(count, exact_log2(4), L1);
 1373         // this is the same as above but copying only 4 longs hence
 1374         // with only one intervening stp between the str instructions
 1375         // but note that the offsets and registers still follow the
 1376         // same pattern
 1377         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1378         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1379         if (direction == copy_forwards) {
 1380           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1381           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1382           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1383         } else {
 1384           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1385           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1386           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1387         }
 1388         __ bind(L1);
 1389 
 1390         __ tbz(count, 1, L2);
 1391         // this is the same as above but copying only 2 longs hence
 1392         // there is no intervening stp between the str instructions
 1393         // but note that the offset and register patterns are still
 1394         // the same
 1395         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1396         if (direction == copy_forwards) {
 1397           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1398           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1399         } else {
 1400           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1401           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1402         }
 1403         __ bind(L2);
 1404 
 1405         // for forwards copy we need to re-adjust the offsets we
 1406         // applied so that s and d are follow the last words written
 1407 
 1408         if (direction == copy_forwards) {
 1409           __ add(s, s, 16);
 1410           __ add(d, d, 8);
 1411         }
 1412 
 1413       }
 1414 
 1415       __ ret(lr);
 1416     }
 1417 
 1418     // record the stub entry and end
 1419     store_archive_data(stub_id, start, __ pc());
 1420 
 1421     return start;
 1422   }
 1423 
 1424   // Small copy: less than 16 bytes.
 1425   //
 1426   // NB: Ignores all of the bits of count which represent more than 15
 1427   // bytes, so a caller doesn't have to mask them.
 1428 
 1429   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
          // Emits a branch-per-bit copy of the low-order part of count.
          //
          //   decorators, type - forwarded to the ArrayCopyBarrierSetHelper
          //                      used for the 8-byte chunk
          //   s, d             - source / destination registers; updated
          //                      through __ adjust() as chunks are copied
          //   count            - element count; only individual bits are
          //                      tested (tbz), the register is not modified
          //   step             - element size in bytes, negative for a
          //                      backwards copy
          //
          // Uses r3 as the data temp and rscratch1, rscratch2, r10 as
          // barrier-helper temps.
 1430     bool is_backwards = step < 0;
 1431     size_t granularity = g_uabs(step);
 1432     int direction = is_backwards ? -1 : 1;
 1433
 1434     Label Lword, Lint, Lshort, Lbyte;
 1435
 1436     assert(granularity
 1437            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1438
 1439     const Register t0 = r3;
 1440     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1441     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1442
 1443     // ??? I don't know if this bit-test-and-branch is the right thing
 1444     // to do.  It does a lot of jumping, resulting in several
 1445     // mispredicted branches.  It might make more sense to do this
 1446     // with something like Duff's device with a single computed branch.
 1447
          // count is in units of granularity, so the bit that stands for
          // an 8-byte chunk is bit (3 - log2(granularity)); likewise 2 and
          // 1 for the 4- and 2-byte chunks below.  A clear bit skips the
          // corresponding chunk.
          // NOTE(review): __ adjust() is presumably pre-indexed for a
          // backwards copy and post-indexed for a forwards copy - confirm
          // against its definition.
 1448     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1449     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1450     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1451     __ bind(Lword);
 1452
          // The sub-word chunks are only emitted when an element is small
          // enough for such a chunk to exist.  They use plain
          // ldr*/str* instructions rather than the barrier helper.
 1453     if (granularity <= sizeof (jint)) {
 1454       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1455       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1456       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1457       __ bind(Lint);
 1458     }
 1459
 1460     if (granularity <= sizeof (jshort)) {
 1461       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1462       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1463       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1464       __ bind(Lshort);
 1465     }
 1466
 1467     if (granularity <= sizeof (jbyte)) {
 1468       __ tbz(count, 0, Lbyte);
 1469       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1470       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1471       __ bind(Lbyte);
 1472     }
 1473   }
 1474 
 1475   // All-singing all-dancing memory copy.
 1476   //
 1477   // Copy count units of memory from s to d.  The size of a unit is
 1478   // step, which can be positive or negative depending on the direction
 1479   // of copy.  If is_aligned is false, we align the source address.
 1480   //
 1481 
 1482   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1483                    Register s, Register d, Register count, int step) {
          // Emits two code paths:
          //  - an inline path for small copies (at most 80 bytes, or 96
          //    when UseSIMDForMemoryOps) that loads everything before it
          //    stores anything;
          //  - a bulk path (copy_big) that first brings s to a 2-word
          //    boundary, then calls one of the StubRoutines::aarch64
          //    copy_* stubs with the word count in r15, and finally copies
          //    the remaining tail with copy_memory_small.
          //
          // s, d and count are modified by the emitted code.  step is the
          // element size in bytes, negative for a backwards copy.
 1484     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1485     bool is_backwards = step < 0;
 1486     unsigned int granularity = g_uabs(step);
 1487     const Register t0 = r3, t1 = r4;
 1488
 1489     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
 1490     // load all the data before writing anything
 1491     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1492     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1493     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1494     const Register send = r17, dend = r16;
 1495     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1496     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1497     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1498
 1499     if (PrefetchCopyIntervalInBytes > 0)
 1500       __ prfm(Address(s, 0), PLDL1KEEP);
          // All thresholds compared against count are in elements, i.e. a
          // byte size divided by granularity.
 1501     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1502     __ br(Assembler::HI, copy_big);
 1503
          // send / dend = s / d + count * granularity, i.e. the address just
          // past the last element.  Each inline case copies a head chunk
          // addressed from s/d and a tail chunk addressed back from
          // send/dend; the two chunks may overlap in the middle.
 1504     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1505     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1506
 1507     __ cmp(count, u1(16/granularity));
 1508     __ br(Assembler::LS, copy16);
 1509
 1510     __ cmp(count, u1(64/granularity));
 1511     __ br(Assembler::HI, copy80);
 1512
 1513     __ cmp(count, u1(32/granularity));
 1514     __ br(Assembler::LS, copy32);
 1515
 1516     // 33..64 bytes
 1517     if (UseSIMDForMemoryOps) {
 1518       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1519       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1520       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1521       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1522     } else {
 1523       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1524       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1525       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1526       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1527
 1528       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1529       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1530       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1531       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1532     }
 1533     __ b(finish);
 1534
 1535     // 17..32 bytes
 1536     __ bind(copy32);
 1537     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1538     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1539
 1540     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1541     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1542     __ b(finish);
 1543
 1544     // 65..80/96 bytes
 1545     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1546     __ bind(copy80);
 1547     if (UseSIMDForMemoryOps) {
 1548       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1549       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1550       // Unaligned pointers can be an issue for copying.
 1551       // The issue has more chances to happen when granularity of data is
 1552       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
 1553       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1554       // The most performance drop has been seen for the range 65-80 bytes.
 1555       // For such cases using the pair of ldp/stp instead of the third pair of
 1556       // ldpq/stpq fixes the performance issue.
 1557       if (granularity < sizeof (jint)) {
 1558         Label copy96;
 1559         __ cmp(count, u1(80/granularity));
 1560         __ br(Assembler::HI, copy96);
 1561         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1562
 1563         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1564         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1565
 1566         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1567         __ b(finish);
 1568
 1569         __ bind(copy96);
 1570       }
 1571       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1572
 1573       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1574       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1575
 1576       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1577     } else {
 1578       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1579       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1580       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1581       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1582       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1583
 1584       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1585       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1586       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1587       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1588       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1589     }
 1590     __ b(finish);
 1591
 1592     // 0..16 bytes
 1593     __ bind(copy16);
 1594     __ cmp(count, u1(8/granularity));
 1595     __ br(Assembler::LO, copy8);
 1596
 1597     // 8..16 bytes
 1598     bs.copy_load_at_8(t0, Address(s, 0));
 1599     bs.copy_load_at_8(t1, Address(send, -8));
 1600     bs.copy_store_at_8(Address(d, 0), t0);
 1601     bs.copy_store_at_8(Address(dend, -8), t1);
 1602     __ b(finish);
 1603
          // The sub-8-byte cases only exist for element sizes below 8 (and
          // below 4 for copy4).  For larger elements the copy8 / copy4
          // labels are bound at the very end of this function instead.
 1604     if (granularity < 8) {
 1605       // 4..7 bytes
 1606       __ bind(copy8);
 1607       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1608       __ ldrw(t0, Address(s, 0));
 1609       __ ldrw(t1, Address(send, -4));
 1610       __ strw(t0, Address(d, 0));
 1611       __ strw(t1, Address(dend, -4));
 1612       __ b(finish);
 1613       if (granularity < 4) {
 1614         // 0..3 bytes
 1615         __ bind(copy4);
 1616         __ cbz(count, finish); // get rid of 0 case
 1617         if (granularity == 2) {
 1618           __ ldrh(t0, Address(s, 0));
 1619           __ strh(t0, Address(d, 0));
 1620         } else { // granularity == 1
 1621           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1622           // the first and last byte.
 1623           // Handle the 3 byte case by loading and storing base + count/2
 1624           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1625           // This does mean in the 1 byte case we load/store the same
 1626           // byte 3 times.
                // Note: count is overwritten with count/2 here.
 1627           __ lsr(count, count, 1);
 1628           __ ldrb(t0, Address(s, 0));
 1629           __ ldrb(t1, Address(send, -1));
 1630           __ ldrb(t2, Address(s, count));
 1631           __ strb(t0, Address(d, 0));
 1632           __ strb(t1, Address(dend, -1));
 1633           __ strb(t2, Address(d, count));
 1634         }
 1635         __ b(finish);
 1636       }
 1637     }
 1638
 1639     __ bind(copy_big);
          // A backwards copy works down from the end of the ranges, so
          // move s and d past the last element first.
 1640     if (is_backwards) {
 1641       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1642       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1643     }
 1644
 1645     // Now we've got the small case out of the way we can align the
 1646     // source address on a 2-word boundary.
 1647
 1648     // Here we will materialize a count in r15, which is used by copy_memory_small
 1649     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
 1650     // Up until here, we have used t9, which aliases r15, but from here on, that register
 1651     // can not be used as a temp register, as it contains the count.
 1652
 1653     Label aligned;
 1654
 1655     if (is_aligned) {
 1656       // We may have to adjust by 1 word to get s 2-word-aligned.
            // Only the wordSize bit of s is tested, so this relies on s
            // already being word-aligned when is_aligned is set.
 1657       __ tbz(s, exact_log2(wordSize), aligned);
 1658       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1659       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1660       __ sub(count, count, wordSize/granularity);
 1661     } else {
            // Distance in bytes from s to the 2-word boundary in the copy
            // direction: s mod 16 going backwards, (-s) mod 16 forwards.
 1662       if (is_backwards) {
 1663         __ andr(r15, s, 2 * wordSize - 1);
 1664       } else {
 1665         __ neg(r15, s);
 1666         __ andr(r15, r15, 2 * wordSize - 1);
 1667       }
 1668       // r15 is the byte adjustment needed to align s.
 1669       __ cbz(r15, aligned);
            // Convert the byte adjustment to elements and take it off count
            // before copying those elements with copy_memory_small.
 1670       int shift = exact_log2(granularity);
 1671       if (shift > 0) {
 1672         __ lsr(r15, r15, shift);
 1673       }
 1674       __ sub(count, count, r15);
 1675
 1676 #if 0
 1677       // ?? This code is only correct for a disjoint copy.  It may or
 1678       // may not make sense to use it in that case.
 1679
 1680       // Copy the first pair; s and d may not be aligned.
 1681       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1682       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1683
 1684       // Align s and d, adjust count
 1685       if (is_backwards) {
 1686         __ sub(s, s, r15);
 1687         __ sub(d, d, r15);
 1688       } else {
 1689         __ add(s, s, r15);
 1690         __ add(d, d, r15);
 1691       }
 1692 #else
 1693       copy_memory_small(decorators, type, s, d, r15, step);
 1694 #endif
 1695     }
 1696
 1697     __ bind(aligned);
 1698
 1699     // s is now 2-word-aligned.
 1700
 1701     // We have a count of units and some trailing bytes. Adjust the
 1702     // count and do a bulk copy of words. If the shift is zero
 1703     // perform a move instead to benefit from zero latency moves.
 1704     int shift = exact_log2(wordSize/granularity);
 1705     if (shift > 0) {
 1706       __ lsr(r15, count, shift);
 1707     } else {
 1708       __ mov(r15, count);
 1709     }
          // Pick the bulk stub by direction and by kind of data: the byte
          // stub for anything that is not T_OBJECT, otherwise the oop stub
          // or its uninitialized-destination variant.
 1710     if (direction == copy_forwards) {
 1711       if (type != T_OBJECT) {
 1712         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
 1713         __ blr(rscratch1);
 1714       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1715         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
 1716         __ blr(rscratch1);
 1717       } else {
 1718         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
 1719         __ blr(rscratch1);
 1720       }
 1721     } else {
 1722       if (type != T_OBJECT) {
 1723         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
 1724         __ blr(rscratch1);
 1725       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1726         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
 1727         __ blr(rscratch1);
 1728       } else {
 1729         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
 1730         __ blr(rscratch1);
 1731       }
 1732     }
 1733
 1734     // And the tail.
          // copy_memory_small only tests the low-order bits of count, so
          // count does not need to be masked first.
 1735     copy_memory_small(decorators, type, s, d, count, step);
 1736
          // With elements of 8 (resp. 4) bytes or more, the branches to
          // copy8 (resp. copy4) above are only taken when count is zero,
          // so those labels just land on finish.
 1737     if (granularity >= 8) __ bind(copy8);
 1738     if (granularity >= 4) __ bind(copy4);
 1739     __ bind(finish);
 1740   }
 1741 
 1742 
 1743   void clobber_registers() {
          // In ASSERT builds, emits code that overwrites every register in
          // MacroAssembler::call_clobbered_gp_registers() with the pattern
          // 0xdeadbeefdeadbeef.  rscratch1 is excluded from the set because
          // it holds the pattern itself.  Emits nothing in other builds.
 1744 #ifdef ASSERT
 1745     RegSet clobbered
 1746       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
          // Build the 64-bit pattern: low half first, then OR in a copy
          // shifted left by 32.
 1747     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1748     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1749     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1750       __ mov(*it, rscratch1);
 1751     }
 1752 #endif
 1753
 1754   }
 1755 
 1756   // Scan over array at a for count oops, verifying each one.
 1757   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1758   void verify_oop_array (int size, Register a, Register count, Register temp) {
          //   size  - element size in bytes; wordSize selects full-width
          //           oops, any other value the 32-bit (narrow) path
          //   a     - array base address
          //   count - number of elements to check
          //   temp  - receives each loaded element
 1759     Label loop, end;
          // NOTE(review): rscratch1 receives a copy of a but is not read
          // again by the code visible in this function.
 1760     __ mov(rscratch1, a);
          // rscratch2 is the element index, starting at zero.
 1761     __ mov(rscratch2, zr);
 1762     __ bind(loop);
          // Unsigned compare: leave the loop once index >= count.
 1763     __ cmp(rscratch2, count);
 1764     __ br(Assembler::HS, end);
 1765     if (size == wordSize) {
 1766       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1767       __ verify_oop(temp);
 1768     } else {
 1769       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1770       __ decode_heap_oop(temp); // calls verify_oop
 1771     }
 1772     __ add(rscratch2, rscratch2, 1);
 1773     __ b(loop);
 1774     __ bind(end);
 1775   }
 1776 
 1777   // Arguments:
 1778   //   stub_id - is used to name the stub and identify all details of
 1779   //             how to perform the copy.
 1780   //
 1781   //   nopush_entry - is assigned to the stub's post push entry point
 1782   //                  unless it is null
 1783   //
 1784   // Inputs:
 1785   //   c_rarg0   - source array address
 1786   //   c_rarg1   - destination array address
 1787   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1788   //
 1789   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1790   // the hardware handle it.  The two dwords within qwords that span
 1791   // cache line boundaries will still be loaded and stored atomically.
 1792   //
 1793   // Side Effects: nopush_entry is set to the (post push) entry point
 1794   //               so it can be used by the corresponding conjoint
 1795   //               copy method
 1796   //
 1797   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
          // Generates (or reloads from archived data) the forward
          // array-copy stub selected by stub_id and returns its start
          // address.  *nopush_entry receives the address just after the
          // frame push.
          //
          // Per-stub properties decoded from stub_id below.
 1798     int size;                  // element size in bytes
 1799     bool aligned;              // passed to copy_memory as is_aligned
 1800     bool is_oop;               // elements are oops (GC barriers apply)
 1801     bool dest_uninitialized;   // adds IS_DEST_UNINITIALIZED to the decorators
 1802     switch (stub_id) {
 1803     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1804       size = sizeof(jbyte);
 1805       aligned = false;
 1806       is_oop = false;
 1807       dest_uninitialized = false;
 1808       break;
 1809     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1810       size = sizeof(jbyte);
 1811       aligned = true;
 1812       is_oop = false;
 1813       dest_uninitialized = false;
 1814       break;
 1815     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1816       size = sizeof(jshort);
 1817       aligned = false;
 1818       is_oop = false;
 1819       dest_uninitialized = false;
 1820       break;
 1821     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1822       size = sizeof(jshort);
 1823       aligned = true;
 1824       is_oop = false;
 1825       dest_uninitialized = false;
 1826       break;
 1827     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1828       size = sizeof(jint);
 1829       aligned = false;
 1830       is_oop = false;
 1831       dest_uninitialized = false;
 1832       break;
 1833     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1834       size = sizeof(jint);
 1835       aligned = true;
 1836       is_oop = false;
 1837       dest_uninitialized = false;
 1838       break;
 1839     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1840       // since this is always aligned we can (should!) use the same
 1841       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1842       ShouldNotReachHere();
 1843       break;
 1844     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1845       size = sizeof(jlong);
 1846       aligned = true;
 1847       is_oop = false;
 1848       dest_uninitialized = false;
 1849       break;
          // All four oop variants derive size and alignment from
          // UseCompressedOops; they differ only in dest_uninitialized.
 1850     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1851       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1852       aligned = !UseCompressedOops;
 1853       is_oop = true;
 1854       dest_uninitialized = false;
 1855       break;
 1856     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1857       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1858       aligned = !UseCompressedOops;
 1859       is_oop = true;
 1860       dest_uninitialized = false;
 1861       break;
 1862     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1863       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1864       aligned = !UseCompressedOops;
 1865       is_oop = true;
 1866       dest_uninitialized = true;
 1867       break;
 1868     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1869       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1870       aligned = !UseCompressedOops;
 1871       is_oop = true;
 1872       dest_uninitialized = true;
 1873       break;
 1874     default:
 1875       ShouldNotReachHere();
 1876       break;
 1877     }
 1878     // all stubs provide a 2nd entry which omits the frame push for
 1879     // use when bailing out from a conjoint copy. However we may also
 1880     // need some extra addresses for memory access protection.
 1881     int entry_count = StubInfo::entry_count(stub_id);
 1882     assert(entry_count == 2, "sanity check");
 1883     assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");
 1884
          // Unsafe-access handler addresses are recorded only for non-oop
          // stubs that are either unaligned or copy jlong elements.
 1885     bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
 1886     int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
 1887     GrowableArray<address> entries;
 1888     GrowableArray<address> extras;
 1889     GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
          // If load_archive_data returns a non-null start address, reuse
          // that stub: publish its nopush entry, re-register its handlers
          // and skip code generation.
 1890     address start = load_archive_data(stub_id, &entries, extras_ptr);
 1891     if (start != nullptr) {
 1892       assert(entries.length() == entry_count - 1,
 1893              "unexpected entries count %d", entries.length());
 1894       *nopush_entry = entries.at(0);
 1895       assert(extras.length() == extra_count,
 1896              "unexpected extra count %d", extras.length());
 1897       if (add_extras) {
 1898         // register one handler at offset 0
 1899         register_unsafe_access_handlers(extras, 0, 1);
 1900       }
 1901       return start;
 1902     }
 1903
 1904     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1905     RegSet saved_reg = RegSet::of(s, d, count);
 1906
 1907     __ align(CodeEntryAlignment);
 1908     StubCodeMark mark(this, stub_id);
 1909     start = __ pc();
 1910     __ enter();
 1911
          // The nopush entry is the pc immediately after the frame push.
 1912     *nopush_entry = __ pc();
 1913     entries.append(*nopush_entry);
 1914
 1915     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1916     BLOCK_COMMENT("Post-Push Entry:");
 1917
 1918     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1919     if (dest_uninitialized) {
 1920       decorators |= IS_DEST_UNINITIALIZED;
 1921     }
 1922     if (aligned) {
 1923       decorators |= ARRAYCOPY_ALIGNED;
 1924     }
 1925
 1926     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1927     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1928
          // copy_memory modifies d and count; for oop copies they are
          // needed again afterwards (verification and GC epilogue), so
          // they are saved on the stack around the copy.
 1929     if (is_oop) {
 1930       // save regs before copy_memory
 1931       __ push(RegSet::of(d, count), sp);
 1932     }
 1933     {
 1934       // UnsafeMemoryAccess page error: continue after unsafe access
 1935       UnsafeMemoryAccessMark umam(this, add_extras, true);
            // A positive step requests a forwards copy.
 1936       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1937     }
 1938
 1939     if (is_oop) {
 1940       __ pop(RegSet::of(d, count), sp);
 1941       if (VerifyOops)
 1942         verify_oop_array(size, d, count, r16);
 1943     }
 1944
 1945     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1946
 1947     __ leave();
 1948     __ mov(r0, zr); // return 0
 1949     __ ret(lr);
 1950
 1951     address end = __ pc();
 1952
 1953     if (add_extras) {
 1954       // retrieve the registered handler addresses
 1955       retrieve_unsafe_access_handlers(start, end, extras);
 1956       assert(extras.length() == extra_count
 1957              , "incorrect handlers count %d", extras.length());
 1958     }
 1959
 1960     // record the stub entry and end plus the no_push entry and any
 1961     // extra handler addresses
 1962     store_archive_data(stub_id, start, end, &entries, extras_ptr);
 1963
 1964     return start;
 1965   }
 1966 
 1967   // Arguments:
 1968   //   stub_id - is used to name the stub and identify all details of
 1969   //             how to perform the copy.
 1970   //
 1971   //   nooverlap_target - identifies the (post push) entry for the
 1972   //             corresponding disjoint copy routine which can be
 1973   //             jumped to if the ranges do not actually overlap
 1974   //
 1975   //   nopush_entry - is assigned to the stub's post push entry point
 1976   //                  unless it is null
 1977   //
 1978   //
 1979   // Inputs:
 1980   //   c_rarg0   - source array address
 1981   //   c_rarg1   - destination array address
 1982   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1983   //
 1984   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1985   // the hardware handle it.  The two dwords within qwords that span
 1986   // cache line boundaries will still be loaded and stored atomically.
 1987   //
 1988   // Side Effects:
 1989   //   nopush_entry, when non-null, is set to the (post push) entry
 1990   //   point so it can be used by some other conjoint copy method
 1991   //
 1992   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
          // Generates (or reloads from archived data) the array-copy stub
          // selected by stub_id for possibly overlapping ranges and returns
          // its start address.  The stub jumps to nooverlap_target when a
          // forwards copy is safe and otherwise copies backwards.
          //
          // Per-stub properties decoded from stub_id below.
 1993     int size;                  // element size in bytes
 1994     bool aligned;              // passed to copy_memory as is_aligned
 1995     bool is_oop;               // elements are oops (GC barriers apply)
 1996     bool dest_uninitialized;   // adds IS_DEST_UNINITIALIZED to the decorators
 1997     switch (stub_id) {
 1998     case StubId::stubgen_jbyte_arraycopy_id:
 1999       size = sizeof(jbyte);
 2000       aligned = false;
 2001       is_oop = false;
 2002       dest_uninitialized = false;
 2003       break;
 2004     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 2005       size = sizeof(jbyte);
 2006       aligned = true;
 2007       is_oop = false;
 2008       dest_uninitialized = false;
 2009       break;
 2010     case StubId::stubgen_jshort_arraycopy_id:
 2011       size = sizeof(jshort);
 2012       aligned = false;
 2013       is_oop = false;
 2014       dest_uninitialized = false;
 2015       break;
 2016     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 2017       size = sizeof(jshort);
 2018       aligned = true;
 2019       is_oop = false;
 2020       dest_uninitialized = false;
 2021       break;
 2022     case StubId::stubgen_jint_arraycopy_id:
 2023       size = sizeof(jint);
 2024       aligned = false;
 2025       is_oop = false;
 2026       dest_uninitialized = false;
 2027       break;
 2028     case StubId::stubgen_arrayof_jint_arraycopy_id:
 2029       size = sizeof(jint);
 2030       aligned = true;
 2031       is_oop = false;
 2032       dest_uninitialized = false;
 2033       break;
 2034     case StubId::stubgen_jlong_arraycopy_id:
 2035       // since this is always aligned we can (should!) use the same
 2036       // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 2037       ShouldNotReachHere();
 2038       break;
 2039     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 2040       size = sizeof(jlong);
 2041       aligned = true;
 2042       is_oop = false;
 2043       dest_uninitialized = false;
 2044       break;
          // All four oop variants derive size and alignment from
          // UseCompressedOops; they differ only in dest_uninitialized.
 2045     case StubId::stubgen_oop_arraycopy_id:
 2046       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2047       aligned = !UseCompressedOops;
 2048       is_oop = true;
 2049       dest_uninitialized = false;
 2050       break;
 2051     case StubId::stubgen_arrayof_oop_arraycopy_id:
 2052       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2053       aligned = !UseCompressedOops;
 2054       is_oop = true;
 2055       dest_uninitialized = false;
 2056       break;
 2057     case StubId::stubgen_oop_arraycopy_uninit_id:
 2058       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2059       aligned = !UseCompressedOops;
 2060       is_oop = true;
 2061       dest_uninitialized = true;
 2062       break;
 2063     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 2064       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2065       aligned = !UseCompressedOops;
 2066       is_oop = true;
 2067       dest_uninitialized = true;
 2068       break;
 2069     default:
 2070       ShouldNotReachHere();
 2071     }
 2072     // only some conjoint stubs generate a 2nd entry
          // The caller signals whether a 2nd entry is wanted by passing a
          // non-null nopush_entry; this must agree with the declared count.
 2073     int entry_count = StubInfo::entry_count(stub_id);
 2074     int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
 2075     assert(entry_count == expected_entry_count,
 2076            "expected entry count %d does not match declared entry count %d for stub %s",
 2077            expected_entry_count, entry_count, StubInfo::name(stub_id));
 2078
 2079     // We need to protect memory accesses in certain cases
          // (non-oop stubs that are either unaligned or copy jlong elements)
 2080     bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
 2081     int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
 2082     GrowableArray<address> entries;
 2083     GrowableArray<address> extras;
 2084     GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
 2085     GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
          // If load_archive_data returns a non-null start address, reuse
          // that stub: publish its nopush entry (if requested), re-register
          // its handlers and skip code generation.
 2086     address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
 2087     if (start != nullptr) {
 2088       assert(entries.length() == expected_entry_count - 1,
 2089              "unexpected entries count %d", entries.length());
 2090       assert(extras.length() == extra_count,
 2091              "unexpected extra count %d", extras.length());
 2092       if (nopush_entry != nullptr) {
 2093         *nopush_entry = entries.at(0);
 2094       }
 2095       if (add_extras) {
 2096         // register one handler at offset 0
 2097         register_unsafe_access_handlers(extras, 0, 1);
 2098       }
 2099       return start;
 2100     }
 2101
 2102     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2103     RegSet saved_regs = RegSet::of(s, d, count);
 2104     StubCodeMark mark(this, stub_id);
 2105     start = __ pc();
 2106     __ enter();
 2107
          // The nopush entry is the pc immediately after the frame push.
 2108     if (nopush_entry != nullptr) {
 2109       *nopush_entry = __ pc();
 2110       entries.append(*nopush_entry);
 2111       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 2112       BLOCK_COMMENT("Post-Push Entry:");
 2113     }
 2114
 2115     // use fwd copy when (d-s) above_equal (count*size)
          // The compare is unsigned: LO means d - s < count * size, i.e. the
          // destination starts inside the source range, so fall into the
          // backwards copy below; otherwise jump to nooverlap_target.
 2116     Label L_overlapping;
 2117     __ sub(rscratch1, d, s);
 2118     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 2119     __ br(Assembler::LO, L_overlapping);
 2120     __ b(RuntimeAddress(nooverlap_target));
 2121     __ bind(L_overlapping);
 2122
 2123     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 2124     if (dest_uninitialized) {
 2125       decorators |= IS_DEST_UNINITIALIZED;
 2126     }
 2127     if (aligned) {
 2128       decorators |= ARRAYCOPY_ALIGNED;
 2129     }
 2130
 2131     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 2132     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 2133
          // copy_memory modifies d and count; for oop copies they are
          // needed again afterwards (verification and GC epilogue), so
          // they are saved on the stack around the copy.
 2134     if (is_oop) {
 2135       // save regs before copy_memory
 2136       __ push(RegSet::of(d, count), sp);
 2137     }
 2138     {
 2139       // UnsafeMemoryAccess page error: continue after unsafe access
 2140       UnsafeMemoryAccessMark umam(this, add_extras, true);
            // A negative step requests a backwards copy.
 2141       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 2142     }
 2143     if (is_oop) {
 2144       __ pop(RegSet::of(d, count), sp);
 2145       if (VerifyOops)
 2146         verify_oop_array(size, d, count, r16);
 2147     }
 2148     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 2149     __ leave();
 2150     __ mov(r0, zr); // return 0
 2151     __ ret(lr);
 2152
 2153     assert(entries.length() == expected_entry_count - 1,
 2154            "unexpected entries count %d", entries.length());
 2155
 2156     address end = __ pc();
 2157
 2158     if (add_extras) {
 2159       // retrieve the registered handler addresses
 2160       retrieve_unsafe_access_handlers(start, end, extras);
 2161       assert(extras.length() == extra_count,
 2162              "incorrect handlers count %d", extras.length());
 2163     }
 2164
 2165     // record the stub entry and end plus any no_push entry and/or
 2166     // extra handler addresses
 2167     store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
 2168
 2169     return start;
 2170   }
 2171 
 2172   // Helper for generating a dynamic type check.
 2173   // Smashes rscratch1, rscratch2.
 2174   void generate_type_check(Register sub_klass,
 2175                            Register super_check_offset,
 2176                            Register super_klass,
 2177                            Register temp1,
 2178                            Register temp2,
 2179                            Register result,
 2180                            Label& L_success) {
          // Emits a subtype check of sub_klass against super_klass: the
          // fast path first, then the slow path.  Control transfers to
          // L_success when the check passes and falls out of the emitted
          // sequence (at L_miss) when it fails.
          //
          //   super_check_offset - handed to the fast path
          //   temp1, temp2       - handed to the slow path
          //   result             - not referenced in this function body
 2181     assert_different_registers(sub_klass, super_check_offset, super_klass);
 2182
 2183     BLOCK_COMMENT("type_check:");
 2184
 2185     Label L_miss;
 2186
          // NOTE(review): the slow path's null failure label presumably
          // means "fall through on failure" - confirm against
          // check_klass_subtype_slow_path.
 2187     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 2188                                      super_check_offset);
 2189     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 2190
 2191     // Fall through on failure!
 2192     __ BIND(L_miss);
 2193   }
 2194 
 2195   //
 2196   //  Generate checkcasting array copy stub
 2197   //
        //  The generated stub copies oops one element at a time, from low to high
        //  addresses, and type-checks every non-null element against ckval before
        //  it is stored.  Null elements are stored without a check.
        //
        //  Generator arguments:
        //    stub_id      - stubgen_checkcast_arraycopy_id or
        //                   stubgen_checkcast_arraycopy_uninit_id; the latter adds
        //                   IS_DEST_UNINITIALIZED to the barrier decorators
        //    nopush_entry - if non-null, receives a second entry point that is
        //                   located after the frame push (enter())
        //
 2198   //  Input:
 2199   //    c_rarg0   - source array address
 2200   //    c_rarg1   - destination array address
 2201   //    c_rarg2   - element count, treated as ssize_t, can be zero
 2202   //    c_rarg3   - size_t ckoff (super_check_offset)
 2203   //    c_rarg4   - oop ckval (super_klass)
 2204   //
 2205   //  Output:
 2206   //    r0 ==  0  -  success
 2207   //    r0 == -1^K - failure, where K is partial transfer count
 2208   //
 2209   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 2210     bool dest_uninitialized;
 2211     switch (stub_id) {
 2212     case StubId::stubgen_checkcast_arraycopy_id:
 2213       dest_uninitialized = false;
 2214       break;
 2215     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 2216       dest_uninitialized = true;
 2217       break;
 2218     default:
 2219       ShouldNotReachHere();   // no other stub id is handled by this generator
 2220     }
 2221
 2222     // The normal stub provides a 2nd entry which omits the frame push
 2223     // for use when bailing out from a disjoint copy.
 2224     // Only some conjoint stubs generate a 2nd entry
        // Here the 2nd entry is requested by passing a non-null nopush_entry;
        // the entry count declared for stub_id must agree with that choice.
 2225     int entry_count = StubInfo::entry_count(stub_id);
 2226     int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
 2227     GrowableArray<address> entries;
        // Extra entries are only collected when a 2nd entry is expected.
 2228     GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
 2229     assert(entry_count == expected_entry_count,
 2230            "expected entry count %d does not match declared entry count %d for stub %s",
 2231            expected_entry_count, entry_count, StubInfo::name(stub_id));
        // A non-null result means code for this stub is already available and
        // nothing needs to be generated.
        // NOTE(review): presumably restored from a code archive -- confirm
        // against load_archive_data.
 2232     address start = load_archive_data(stub_id, entries_ptr);
 2233     if (start != nullptr) {
 2234       assert(entries.length() + 1 == expected_entry_count,
 2235              "expected entry count %d does not match return entry count %d for stub %s",
 2236              expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
 2237       if (nopush_entry != nullptr) {
 2238         *nopush_entry = entries.at(0);   // the only extra entry is the no-push entry
 2239       }
 2240       return start;
 2241     }
 2242
 2243     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 2244
 2245     // Input registers (after setup_arg_regs)
 2246     const Register from        = c_rarg0;   // source array address
 2247     const Register to          = c_rarg1;   // destination array address
 2248     const Register count       = c_rarg2;   // element count
 2249     const Register ckoff       = c_rarg3;   // super_check_offset
 2250     const Register ckval       = c_rarg4;   // super_klass
 2251
        // Register set handed to the barrier prologue below.
 2252     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 2253
 2254     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 2255     const Register copied_oop  = r22;       // actual oop copied
 2256     const Register count_save  = r21;       // orig element count
 2257     const Register start_to    = r20;       // destination array start address
 2258     const Register r19_klass   = r19;       // oop._klass
 2259
 2260     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 2261     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 2262
 2263     //---------------------------------------------------------------
 2264     // Assembler stub will be used for this call to arraycopy
 2265     // if the two arrays are subtypes of Object[] but the
 2266     // destination array type is not equal to or a supertype
 2267     // of the source type.  Each element must be separately
 2268     // checked.
 2269
 2270     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 2271                                copied_oop, r19_klass, count_save);
 2272
 2273     __ align(CodeEntryAlignment);
 2274     StubCodeMark mark(this, stub_id);
 2275     start = __ pc();
 2276
 2277     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2278
 2279 #ifdef ASSERT
 2280     // caller guarantees that the arrays really are different
 2281     // otherwise, we would have to make conjoint checks
        // The unconditional branch skips the stop(), so this block is only a
        // placeholder for a future check.
 2282     { Label L;
 2283       __ b(L);                  // conjoint check not yet implemented
 2284       __ stop("checkcast_copy within a single array");
 2285       __ bind(L);
 2286     }
 2287 #endif //ASSERT
 2288
 2289     // Caller of this entry point must set up the argument registers.
        // The no-push entry sits after enter(), so a caller using it does not
        // get a frame pushed by this stub.
 2290     if (nopush_entry != nullptr) {
 2291       *nopush_entry = __ pc();
 2292       entries.append(*nopush_entry);   // recorded so it is passed to store_archive_data
 2293       BLOCK_COMMENT("Entry:");
 2294     }
 2295
 2296      // Empty array:  Nothing to do.
        // L_done returns count (== 0 here) in r0 and bypasses the register
        // pop and the counter increment.
 2297     __ cbz(count, L_done);
 2298     __ push(RegSet::of(r19, r20, r21, r22), sp);   // popped again at L_done_pop
 2299
 2300 #ifdef ASSERT
 2301     BLOCK_COMMENT("assert consistent ckoff/ckval");
 2302     // The ckoff and ckval must be mutually consistent,
 2303     // even though caller generates both.
        // start_to is borrowed as a scratch register; it receives its real
        // value further down.
 2304     { Label L;
 2305       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2306       __ ldrw(start_to, Address(ckval, sco_offset));
 2307       __ cmpw(ckoff, start_to);
 2308       __ br(Assembler::EQ, L);
 2309       __ stop("super_check_offset inconsistent");
 2310       __ bind(L);
 2311     }
 2312 #endif //ASSERT
 2313
 2314     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 2315     bool is_oop = true;
 2316     int element_size = UseCompressedOops ? 4 : 8;   // bytes per heap oop
 2317     if (dest_uninitialized) {
 2318       decorators |= IS_DEST_UNINITIALIZED;
 2319     }
 2320
 2321     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 2322     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 2323
 2324     // save the original count
 2325     __ mov(count_save, count);
 2326
 2327     // Copy from low to high addresses
 2328     __ mov(start_to, to);              // Save destination array start address
 2329     __ b(L_load_element);
 2330
 2331     // ======== begin loop ========
 2332     // (Loop is rotated; its entry is L_load_element.)
 2333     // Loop control:
 2334     //   for (; count != 0; count--) {
 2335     //     copied_oop = load_heap_oop(from++);
 2336     //     ... generate_type_check ...;
 2337     //     store_heap_oop(to++, copied_oop);
 2338     //   }
 2339     __ align(OptoLoopAlignment);
 2340
 2341     __ BIND(L_store_element);
 2342     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 2343                       __ post(to, element_size), copied_oop, noreg,
 2344                       gct1, gct2, gct3);
 2345     __ sub(count, count, 1);
 2346     __ cbz(count, L_do_card_marks);    // all elements copied: count == 0 is returned
 2347
 2348     // ======== loop entry is here ========
 2349     __ BIND(L_load_element);
 2350     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 2351                      copied_oop, noreg, __ post(from, element_size),
 2352                      gct1);
 2353     __ cbz(copied_oop, L_store_element);   // null needs no type check
 2354
 2355     __ load_klass(r19_klass, copied_oop);// query the object klass
 2356
 2357     BLOCK_COMMENT("type_check:");
 2358     generate_type_check(/*sub_klass*/r19_klass,
 2359                         /*super_check_offset*/ckoff,
 2360                         /*super_klass*/ckval,
 2361                         /*temp1*/gct1,
 2362                         /*temp2*/gct2,
 2363                         /*result*/r10, L_store_element);
 2364
 2365     // Fall through on failure!
 2366
 2367     // ======== end loop ========
 2368
 2369     // It was a real error; we must depend on the caller to finish the job.
 2370     // Register count = remaining oops, count_save = total oops.
 2371     // Emit GC store barriers for the oops we have copied and report
 2372     // their number to the caller.
 2373
        // The EQ branch consumes the flags produced by subs (K == 0); the eon
        // in between only rewrites count.
 2374     __ subs(count, count_save, count);     // K = partially copied oop count
 2375     __ eon(count, count, zr);              // report (-1^K) to caller
 2376     __ br(Assembler::EQ, L_done_pop);      // nothing copied: skip the epilogue
 2377
 2378     __ BIND(L_do_card_marks);
        // NOTE(review): the epilogue is given count_save (the requested count)
        // on the partial-copy path as well, not K -- confirm this is intended.
 2379     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
 2380
 2381     __ bind(L_done_pop);
 2382     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2383     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2384
 2385     __ bind(L_done);
 2386     __ mov(r0, count);                     // 0 on success, -1^K on failure
 2387     __ leave();
 2388     __ ret(lr);
 2389
 2390     // record the stub entry and end plus any no_push entry
 2391     store_archive_data(stub_id, start, __ pc() , entries_ptr);
 2392     return start;
 2393   }
 2394 
 2395   // Perform range checks on the proposed arraycopy.
 2396   // Kills temp and rscratch1 (both are used as scratch below).
 2397   // Also, clean the sign bits of src_pos and dst_pos.
        //
        // Emits a branch to L_failed when src_pos + length exceeds src's length
        // or dst_pos + length exceeds dst's length.  Both sums are 32-bit and
        // are compared with the unsigned condition HI.
 2398   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2399                               Register src_pos, // source position (c_rarg1)
 2400                               Register dst,     // destination array oop (c_rarg2)
 2401                               Register dst_pos, // destination position (c_rarg3)
 2402                               Register length,
 2403                               Register temp,
 2404                               Label& L_failed) {
 2405     BLOCK_COMMENT("arraycopy_range_checks:");
 2406
 2407     assert_different_registers(rscratch1, temp);
 2408
 2409     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2410     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2411     __ addw(temp, length, src_pos);
 2412     __ cmpw(temp, rscratch1);
 2413     __ br(Assembler::HI, L_failed);
 2414
 2415     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2416     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2417     __ addw(temp, length, dst_pos);
 2418     __ cmpw(temp, rscratch1);
 2419     __ br(Assembler::HI, L_failed);
 2420
 2421     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
        // A 32-bit move of a register onto itself is used for that.
 2422     __ movw(src_pos, src_pos);
 2423     __ movw(dst_pos, dst_pos);
 2424
 2425     BLOCK_COMMENT("arraycopy_range_checks done");
 2426   }
 2427 
 2428   // These stubs get called from some dumb test routine.
 2429   // I'll write them properly when they're called from
 2430   // something that's actually doing something.
        //
        // Placeholder: copies nothing.  src and dst are ignored and the only
        // action is an assertion that count is zero.
 2431   static void fake_arraycopy_stub(address src, address dst, int count) {
 2432     assert(count == 0, "huh?");
 2433   }
 2434 
 2435 
 2436   //
 2437   //  Generate 'unsafe' array copy stub
 2438   //  Though just as safe as the other stubs, it takes an unscaled
 2439   //  size_t argument instead of an element count.
 2440   //
        //  Generator arguments:
        //    byte_copy_entry, short_copy_entry, int_copy_entry, long_copy_entry
        //              - addresses of the copy routines this stub branches to
        //
 2441   //  Input:
 2442   //    c_rarg0   - source array address
 2443   //    c_rarg1   - destination array address
 2444   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2445   //
 2446   // Examines the alignment of the operands and dispatches
 2447   // to a long, int, short, or byte copy loop.
 2448   //
 2449   address generate_unsafe_copy(address byte_copy_entry,
 2450                                address short_copy_entry,
 2451                                address int_copy_entry,
 2452                                address long_copy_entry) {
 2453     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2454     int entry_count = StubInfo::entry_count(stub_id);
 2455     assert(entry_count == 1, "sanity check");
        // A non-null result means the stub is already available; skip generation.
 2456     address start = load_archive_data(stub_id);
 2457     if (start != nullptr) {
 2458       return start;
 2459     }
 2460     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2461     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2462
 2463     __ align(CodeEntryAlignment);
 2464     StubCodeMark mark(this, stub_id);
 2465     start = __ pc();
 2466     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2467
 2468     // bump this on entry, not on exit:
 2469     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2470
        // OR source, destination and size together: a low bit that is clear in
        // the result is clear in all three.
 2471     __ orr(rscratch1, s, d);
 2472     __ orr(rscratch1, rscratch1, count);
 2473
 2474     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2475     __ cbz(rscratch1, L_long_aligned);       // everything is a multiple of 8
 2476     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2477     __ cbz(rscratch1, L_int_aligned);        // everything is a multiple of 4
 2478     __ tbz(rscratch1, 0, L_short_aligned);   // bit 0 clear: multiple of 2
 2479     __ b(RuntimeAddress(byte_copy_entry));   // otherwise copy bytes, count unchanged
 2480
        // Each aligned case converts the byte count into an element count and
        // branches (does not call) to the matching copy routine.
 2481     __ BIND(L_short_aligned);
 2482     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2483     __ b(RuntimeAddress(short_copy_entry));
 2484     __ BIND(L_int_aligned);
 2485     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2486     __ b(RuntimeAddress(int_copy_entry));
 2487     __ BIND(L_long_aligned);
 2488     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2489     __ b(RuntimeAddress(long_copy_entry));
 2490
 2491     // record the stub entry and end
 2492     store_archive_data(stub_id, start, __ pc());
 2493
 2494     return start;
 2495   }
 2496 
 2497   //
 2498   //  Generate generic array copy stubs
 2499   //
        //  The stub validates its arguments, works out the element kind from the
        //  source klass' layout helper, computes raw element addresses and then
        //  branches to one of the specialised copy routines passed in below.
        //
        //  Generator arguments:
        //    byte_copy_entry, short_copy_entry,
        //    int_copy_entry, long_copy_entry - targets for primitive arrays,
        //                                      chosen by log2 element size
        //    oop_copy_entry                  - target for object arrays that need
        //                                      no per-element check
        //    checkcast_copy_entry            - target for object arrays that need
        //                                      a per-element type check
        //
 2500   //  Input:
 2501   //    c_rarg0    -  src oop
 2502   //    c_rarg1    -  src_pos (32-bits)
 2503   //    c_rarg2    -  dst oop
 2504   //    c_rarg3    -  dst_pos (32-bits)
 2505   //    c_rarg4    -  element count (32-bits)
 2506   //
 2507   //  Output:
 2508   //    r0 ==  0  -  success
 2509   //    r0 == -1^K - failure, where K is partial transfer count
 2510   //
        //  This stub itself only ever sets r0 to -1 (at L_failed); any other
        //  result is whatever the routine it branched to leaves in r0.
        //
 2511   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2512                                 address int_copy_entry, address oop_copy_entry,
 2513                                 address long_copy_entry, address checkcast_copy_entry) {
 2514     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2515     int entry_count = StubInfo::entry_count(stub_id);
 2516     assert(entry_count == 1, "sanity check");
        // A non-null result means the stub is already available; skip generation.
 2517     address start = load_archive_data(stub_id);
 2518     if (start != nullptr) {
 2519       return start;
 2520     }
 2521     Label L_failed, L_objArray;
 2522     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2523
 2524     // Input registers
 2525     const Register src        = c_rarg0;  // source array oop
 2526     const Register src_pos    = c_rarg1;  // source position
 2527     const Register dst        = c_rarg2;  // destination array oop
 2528     const Register dst_pos    = c_rarg3;  // destination position
 2529     const Register length     = c_rarg4;
 2530
 2531
 2532     // Registers used as temps
 2533     const Register dst_klass  = c_rarg5;
 2534
 2535     __ align(CodeEntryAlignment);
 2536
 2537     StubCodeMark mark(this, stub_id);
 2538
 2539     start = __ pc();
 2540
 2541     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2542
 2543     // bump this on entry, not on exit:
 2544     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2545
 2546     //-----------------------------------------------------------------------
 2547     // Assembler stub will be used for this call to arraycopy
 2548     // if the following conditions are met:
 2549     //
 2550     // (1) src and dst must not be null.
 2551     // (2) src_pos must not be negative.
 2552     // (3) dst_pos must not be negative.
 2553     // (4) length  must not be negative.
 2554     // (5) src klass and dst klass should be the same and not null.
 2555     // (6) src and dst should be arrays.
 2556     // (7) src_pos + length must not exceed length of src.
 2557     // (8) dst_pos + length must not exceed length of dst.
 2558     //
        // Condition (5) is only enforced on the primitive-array path; object
        // arrays with different klasses go through a subtype check instead.
 2559
 2560     //  if (src == nullptr) return -1;
 2561     __ cbz(src, L_failed);
 2562
 2563     //  if (src_pos < 0) return -1;
 2564     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2565
 2566     //  if (dst == nullptr) return -1;
 2567     __ cbz(dst, L_failed);
 2568
 2569     //  if (dst_pos < 0) return -1;
 2570     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2571
 2572     // registers used as temp
 2573     const Register scratch_length    = r16; // elements count to copy
 2574     const Register scratch_src_klass = r17; // array klass
 2575     const Register lh                = r15; // layout helper
 2576
 2577     //  if (length < 0) return -1;
 2578     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2579     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2580
 2581     __ load_klass(scratch_src_klass, src);
 2582 #ifdef ASSERT
 2583     //  assert(src->klass() != nullptr);
        // L1 is the shared failure point: it is reached by falling through when
        // the src klass is null and by the branch below when the dst klass is.
 2584     {
 2585       BLOCK_COMMENT("assert klasses not null {");
 2586       Label L1, L2;
 2587       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2588       __ bind(L1);
 2589       __ stop("broken null klass");
 2590       __ bind(L2);
 2591       __ load_klass(rscratch1, dst);
 2592       __ cbz(rscratch1, L1);     // this would be broken also
 2593       BLOCK_COMMENT("} assert klasses not null done");
 2594     }
 2595 #endif
 2596
 2597     // Load layout helper (32-bits)
 2598     //
 2599     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2600     // 32        30    24            16              8     2                 0
 2601     //
 2602     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2603     //
 2604
 2605     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2606
 2607     // Handle objArrays completely differently...
        // The layout helper is compared for exact equality with the T_OBJECT
        // array layout helper (xor, then test for zero).
 2608     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2609     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2610     __ movw(rscratch1, objArray_lh);
 2611     __ eorw(rscratch2, lh, rscratch1);
 2612     __ cbzw(rscratch2, L_objArray);
 2613
 2614     //  if (src->klass() != dst->klass()) return -1;
 2615     __ load_klass(rscratch2, dst);
 2616     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2617     __ cbnz(rscratch2, L_failed);
 2618
 2619     // Check for flat inline type array -> return -1
 2620     __ test_flat_array_oop(src, rscratch2, L_failed);
 2621
 2622     // Check for null-free (non-flat) inline type array -> handle as object array
 2623     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2624
 2625     //  if (!src->is_Array()) return -1;
 2626     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2627
 2628     // At this point, it is known to be a typeArray (array_tag 0x3).
 2629 #ifdef ASSERT
 2630     {
 2631       BLOCK_COMMENT("assert primitive array {");
 2632       Label L;
 2633       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2634       __ cmpw(lh, rscratch2);
 2635       __ br(Assembler::GE, L);
 2636       __ stop("must be a primitive array");
 2637       __ bind(L);
 2638       BLOCK_COMMENT("} assert primitive array done");
 2639     }
 2640 #endif
 2641
 2642     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2643                            rscratch2, L_failed);
 2644
 2645     // TypeArrayKlass
 2646     //
 2647     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2648     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2649     //
 2650
 2651     const Register rscratch1_offset = rscratch1;    // array offset
 2652     const Register r15_elsize = lh; // element size
 2653
        // Extract the header size field of the layout helper and advance both
        // array pointers past the header.
 2654     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2655            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2656     __ add(src, src, rscratch1_offset);           // src array offset
 2657     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2658     BLOCK_COMMENT("choose copy loop based on element size");
 2659
 2660     // next registers should be set before the jump to corresponding stub
 2661     const Register from     = c_rarg0;  // source array address
 2662     const Register to       = c_rarg1;  // destination array address
 2663     const Register count    = c_rarg2;  // elements count
 2664
 2665     // 'from', 'to', 'count' registers should be set in such order
 2666     // since they are the same as 'src', 'src_pos', 'dst'.
        // (Writing 'to' destroys src_pos and writing 'count' destroys dst, so
        // each must be written only after its last use as an input.)
 2667
 2668     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2669
 2670     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2671     // size in bytes).  We do a simple bitwise binary search.
        // L_copy_bytes is never used as a branch target in this function; it
        // only marks the start of the dispatch sequence.
 2672   __ BIND(L_copy_bytes);
 2673     __ tbnz(r15_elsize, 1, L_copy_ints);     // bit 1 set: elsize is 2 or 3
 2674     __ tbnz(r15_elsize, 0, L_copy_shorts);   // bit 0 set: elsize is 1
 2675     __ lea(from, Address(src, src_pos));// src_addr
 2676     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2677     __ movw(count, scratch_length); // length
 2678     __ b(RuntimeAddress(byte_copy_entry));
 2679
 2680   __ BIND(L_copy_shorts);
 2681     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2682     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2683     __ movw(count, scratch_length); // length
 2684     __ b(RuntimeAddress(short_copy_entry));
 2685
 2686   __ BIND(L_copy_ints);
 2687     __ tbnz(r15_elsize, 0, L_copy_longs);    // bit 0 set as well: elsize is 3
 2688     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2689     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2690     __ movw(count, scratch_length); // length
 2691     __ b(RuntimeAddress(int_copy_entry));
 2692
 2693   __ BIND(L_copy_longs);
 2694 #ifdef ASSERT
 2695     {
 2696       BLOCK_COMMENT("assert long copy {");
 2697       Label L;
 2698       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2699       __ cmpw(r15_elsize, LogBytesPerLong);
 2700       __ br(Assembler::EQ, L);
 2701       __ stop("must be long copy, but elsize is wrong");
 2702       __ bind(L);
 2703       BLOCK_COMMENT("} assert long copy done");
 2704     }
 2705 #endif
 2706     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2707     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2708     __ movw(count, scratch_length); // length
 2709     __ b(RuntimeAddress(long_copy_entry));
 2710
 2711     // ObjArrayKlass
 2712   __ BIND(L_objArray);
 2713     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2714
 2715     Label L_plain_copy, L_checkcast_copy;
 2716     //  test array classes for subtyping
        // r15 (previously the layout helper) is reused for the dst klass.
 2717     __ load_klass(r15, dst);
 2718     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2719     __ br(Assembler::NE, L_checkcast_copy);
 2720
 2721     // Identically typed arrays can be copied without element-wise checks.
 2722     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2723                            rscratch2, L_failed);
 2724
 2725     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2726     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2727     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2728     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2729     __ movw(count, scratch_length); // length
        // L_plain_copy is also the success target of the array-klass subtype
        // check below, which arrives with from/to/count already set up.
 2730   __ BIND(L_plain_copy);
 2731     __ b(RuntimeAddress(oop_copy_entry));
 2732
 2733   __ BIND(L_checkcast_copy);
 2734     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2735     {
 2736       // Before looking at dst.length, make sure dst is also an objArray.
 2737       __ ldrw(rscratch1, Address(r15, lh_offset));
 2738       __ movw(rscratch2, objArray_lh);
 2739       __ eorw(rscratch1, rscratch1, rscratch2);
 2740       __ cbnzw(rscratch1, L_failed);
 2741
 2742       // It is safe to examine both src.length and dst.length.
          // r15 is passed as the temp here and is therefore overwritten.
 2743       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2744                              r15, L_failed);
 2745
 2746       __ load_klass(dst_klass, dst); // reload
 2747
 2748       // Marshal the base address arguments now, freeing registers.
 2749       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2750       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2751       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2752       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2753       __ movw(count, length);           // length (reloaded)
 2754       Register sco_temp = c_rarg3;      // this register is free now
 2755       assert_different_registers(from, to, count, sco_temp,
 2756                                  dst_klass, scratch_src_klass);
 2757       // assert_clean_int(count, sco_temp);
 2758
 2759       // Generate the type check.
          // If the src array klass is a subtype of the dst array klass, the
          // plain oop copy is taken (L_plain_copy).
 2760       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2761       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2762
 2763       // Smashes rscratch1, rscratch2
 2764       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2765                           L_plain_copy);
 2766
          // Fall-through: the array klasses are not related that way, so every
          // element has to be checked against the destination element klass.
 2767       // Fetch destination element klass from the ObjArrayKlass header.
 2768       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2769       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2770       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2771
 2772       // the checkcast_copy loop needs two extra arguments:
 2773       assert(c_rarg3 == sco_temp, "#3 already in place");
 2774       // Set up arguments for checkcast_copy_entry.
 2775       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2776       __ b(RuntimeAddress(checkcast_copy_entry));
 2777     }
 2778
 2779   __ BIND(L_failed);
 2780     __ mov(r0, -1);
 2781     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2782     __ ret(lr);
 2783
 2784     // record the stub entry and end
 2785     store_archive_data(stub_id, start, __ pc());
 2786
 2787     return start;
 2788   }
 2789 
 2790   //
 2791   // Generate stub for array fill. If "aligned" is true, the
 2792   // "to" address is assumed to be heapword aligned.
 2793   //
        // The element type and "aligned" are both derived from stub_id (see the
        // switch at the top of the function).
        //
 2794   // Arguments for generated stub:
 2795   //   to:    c_rarg0
 2796   //   value: c_rarg1
 2797   //   count: c_rarg2 treated as signed
 2798   //
 2799   address generate_fill(StubId stub_id) {
 2800     BasicType t;
 2801     bool aligned;
 2802
 2803     switch (stub_id) {
 2804     case StubId::stubgen_jbyte_fill_id:
 2805       t = T_BYTE;
 2806       aligned = false;
 2807       break;
 2808     case StubId::stubgen_jshort_fill_id:
 2809       t = T_SHORT;
 2810       aligned = false;
 2811       break;
 2812     case StubId::stubgen_jint_fill_id:
 2813       t = T_INT;
 2814       aligned = false;
 2815       break;
 2816     case StubId::stubgen_arrayof_jbyte_fill_id:
 2817       t = T_BYTE;
 2818       aligned = true;
 2819       break;
 2820     case StubId::stubgen_arrayof_jshort_fill_id:
 2821       t = T_SHORT;
 2822       aligned = true;
 2823       break;
 2824     case StubId::stubgen_arrayof_jint_fill_id:
 2825       t = T_INT;
 2826       aligned = true;
 2827       break;
 2828     default:
 2829       ShouldNotReachHere();
 2830     };
 2831     int entry_count = StubInfo::entry_count(stub_id);
 2832     assert(entry_count == 1, "sanity check");
        // A non-null result means the stub is already available; skip generation.
 2833     address start = load_archive_data(stub_id);
 2834     if (start != nullptr) {
 2835       return start;
 2836     }
 2837     __ align(CodeEntryAlignment);
 2838     StubCodeMark mark(this, stub_id);
 2839     start = __ pc();
 2840
 2841     BLOCK_COMMENT("Entry:");
 2842
 2843     const Register to        = c_rarg0;  // destination array address
 2844     const Register value     = c_rarg1;  // value
 2845     const Register count     = c_rarg2;  // elements count
 2846
 2847     const Register bz_base = r10;        // base for block_zero routine
 2848     const Register cnt_words = r11;      // temp register
 2849
 2850     __ enter();
 2851
        // NOTE(review): this L_exit1 is never bound or branched to; the tail
        // code further down declares its own L_exit1 that shadows it.
 2852     Label L_fill_elements, L_exit1;
 2853
        // shift is log2 of the element size.  Each case compares count with
        // the number of elements in 8 bytes and widens value to 32 bits by
        // replicating it.  The bfi instructions sit between the compare and
        // the branch that uses its flags.
 2854     int shift = -1;
 2855     switch (t) {
 2856       case T_BYTE:
 2857         shift = 0;
 2858         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2859         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2860         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2861         __ br(Assembler::LO, L_fill_elements);
 2862         break;
 2863       case T_SHORT:
 2864         shift = 1;
 2865         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2866         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2867         __ br(Assembler::LO, L_fill_elements);
 2868         break;
 2869       case T_INT:
 2870         shift = 2;
 2871         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2872         __ br(Assembler::LO, L_fill_elements);
 2873         break;
 2874       default: ShouldNotReachHere();
 2875     }
 2876
 2877     // Align destination address at 8 bytes address boundary.
        // The case fallthrough happens while generating code: a byte stub gets
        // the 1-, 2- and 4-byte alignment steps, a short stub the 2- and 4-byte
        // steps, an int stub only the 4-byte step.
 2878     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2879     if (!aligned) {
 2880       switch (t) {
 2881         case T_BYTE:
 2882           // One byte misalignment happens only for byte arrays.
 2883           __ tbz(to, 0, L_skip_align1);
 2884           __ strb(value, Address(__ post(to, 1)));
 2885           __ subw(count, count, 1);
 2886           __ bind(L_skip_align1);
 2887           // Fallthrough
 2888         case T_SHORT:
 2889           // Two bytes misalignment happens only for byte and short (char) arrays.
 2890           __ tbz(to, 1, L_skip_align2);
 2891           __ strh(value, Address(__ post(to, 2)));
 2892           __ subw(count, count, 2 >> shift);   // 2 bytes expressed in elements
 2893           __ bind(L_skip_align2);
 2894           // Fallthrough
 2895         case T_INT:
 2896           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2897           __ tbz(to, 2, L_skip_align4);
 2898           __ strw(value, Address(__ post(to, 4)));
 2899           __ subw(count, count, 4 >> shift);   // 4 bytes expressed in elements
 2900           __ bind(L_skip_align4);
 2901           break;
 2902         default: ShouldNotReachHere();
 2903       }
 2904     }
 2905
 2906     //
 2907     //  Fill large chunks
 2908     //
        // Split count into whole 8-byte words (cnt_words) and a remainder of
        // fewer than 8 bytes' worth of elements (left in count).
 2909     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2910     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2911     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2912     if (UseBlockZeroing) {
 2913       Label non_block_zeroing, rest;
 2914       // If the fill value is zero we can use the fast zero_words().
 2915       __ cbnz(value, non_block_zeroing);
          // 'to' is advanced past the words here; zero_words works on the
          // saved base in bz_base.
 2916       __ mov(bz_base, to);
 2917       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2918       address tpc = __ zero_words(bz_base, cnt_words);
 2919       if (tpc == nullptr) {
 2920         fatal("CodeCache is full at generate_fill");
 2921       }
 2922       __ b(rest);
 2923       __ bind(non_block_zeroing);
 2924       __ fill_words(to, cnt_words, value);
 2925       __ bind(rest);
 2926     } else {
 2927       __ fill_words(to, cnt_words, value);
 2928     }
 2929
 2930     // Remaining count is less than 8 bytes. Fill it by a single store.
 2931     // Note that the total length is no less than 8 bytes.
        // The 8-byte store ends exactly at the end of the region, so it may
        // rewrite elements that were already filled.  For T_INT this block is
        // not generated and execution continues into the element-wise code
        // below.
 2932     if (t == T_BYTE || t == T_SHORT) {
 2933       Label L_exit1;
 2934       __ cbzw(count, L_exit1);
 2935       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2936       __ str(value, Address(to, -8));    // overwrite some elements
 2937       __ bind(L_exit1);
 2938       __ leave();
 2939       __ ret(lr);
 2940     }
 2941
 2942     // Handle fills of less than 8 bytes.
        // The low bits of count select one store per power of two.
 2943     Label L_fill_2, L_fill_4, L_exit2;
 2944     __ bind(L_fill_elements);
 2945     switch (t) {
 2946       case T_BYTE:
 2947         __ tbz(count, 0, L_fill_2);
 2948         __ strb(value, Address(__ post(to, 1)));
 2949         __ bind(L_fill_2);
 2950         __ tbz(count, 1, L_fill_4);
 2951         __ strh(value, Address(__ post(to, 2)));
 2952         __ bind(L_fill_4);
 2953         __ tbz(count, 2, L_exit2);
 2954         __ strw(value, Address(to));
 2955         break;
 2956       case T_SHORT:
 2957         __ tbz(count, 0, L_fill_4);
 2958         __ strh(value, Address(__ post(to, 2)));
 2959         __ bind(L_fill_4);
 2960         __ tbz(count, 1, L_exit2);
 2961         __ strw(value, Address(to));
 2962         break;
 2963       case T_INT:
 2964         __ cbzw(count, L_exit2);
 2965         __ strw(value, Address(to));
 2966         break;
 2967       default: ShouldNotReachHere();
 2968     }
 2969     __ bind(L_exit2);
 2970     __ leave();
 2971     __ ret(lr);
 2972
 2973     // record the stub entry and end
 2974     store_archive_data(stub_id, start, __ pc());
 2975
 2976     return start;
 2977   }
 2978 
        // Generate the stubgen_unsafecopy_common_id stub: it pops the current
        // frame, sets r0 to 0 and returns to the address in lr.
        // NOTE(review): judging by the name this is the shared exit taken when
        // an unsafe copy faults -- confirm against the users of this stub.
 2979   address generate_unsafecopy_common_error_exit() {
 2980     StubId stub_id = StubId::stubgen_unsafecopy_common_id;
 2981     int entry_count = StubInfo::entry_count(stub_id);
 2982     assert(entry_count == 1, "sanity check");
        // A non-null result means the stub is already available; skip generation.
 2983     address start = load_archive_data(stub_id);
 2984     if (start != nullptr) {
 2985       return start;
 2986     }
 2987     __ align(CodeEntryAlignment);
 2988     StubCodeMark mark(this, stub_id);
 2989     start = __ pc();
 2990       __ leave();
 2991       __ mov(r0, 0);
 2992       __ ret(lr);
 2993
 2994     // record the stub entry and end
 2995     store_archive_data(stub_id, start, __ pc());
 2996
 2997     return start;
 2998   }
 2999 
 3000   //
 3001   //  Generate 'unsafe' set memory stub
 3002   //  Though just as safe as the other stubs, it takes an unscaled
 3003   //  size_t (# bytes) argument instead of an element count.
 3004   //
 3005   //  This fill operation is atomicity preserving: as long as the
 3006   //  address supplied is sufficiently aligned, all writes of up to 64
 3007   //  bits in size are single-copy atomic.
 3008   //
 3009   //  Input:
 3010   //    c_rarg0   - destination array address
 3011   //    c_rarg1   - byte count (size_t)
 3012   //    c_rarg2   - byte value
 3013   //
  address generate_unsafe_setmemory() {
    StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // we expect one set of extra unsafememory access handler entries
    GrowableArray<address> extras;
    int extra_count =  1 * UnsafeMemoryAccess::COLUMN_COUNT;
    address start = load_archive_data(stub_id, nullptr, &extras);
    if (start != nullptr) {
      // The stub was loaded rather than generated: register the handler
      // entries that came back with it and skip code generation.
      assert(extras.length() == extra_count,
             "unexpected extra entry count %d", extras.length());
      register_unsafe_access_handlers(extras, 0, 1);
      return start;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
    Label tail;

    {
    // The mark is scoped by this block so that it is destroyed before
    // the handler addresses are retrieved below.
    UnsafeMemoryAccessMark umam(this, true, false);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Replicate the fill byte into all 16 byte lanes of v0.
    __ dup(v0, __ T16B, value);

    if (AvoidUnalignedAccesses) {
      // Fewer than 16 bytes: go straight to the tail, which only uses
      // stores of the sizes selected by the low bits of count.
      __ cmp(count, (u1)16);
      __ br(__ LO, tail);

      // Write one (possibly unaligned) 16-byte chunk at dest, then step
      // dest forward to the next 16-byte boundary. rscratch1 is in the
      // range 1..16, so an already aligned dest advances by a full 16.
      __ mov(rscratch1, 16);
      __ andr(rscratch2, dest, 15);
      __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
      __ strq(v0, Address(dest));
      __ sub(count, count, rscratch1);
      __ add(dest, dest, rscratch1);
    }

    // Bias count by -64 so the loop can test the flags set by the
    // subtraction; LO means fewer than 64 bytes remain.
    __ subs(count, count, (u1)64);
    __ br(__ LO, tail);
    {
      // Main loop: 64 bytes per iteration using two 32-byte pair stores.
      Label again;
      __ bind(again);
      __ stpq(v0, v0, Address(dest));
      __ stpq(v0, v0, Address(dest, 32));

      __ subs(count, count, 64);
      __ add(dest, dest, 64);
      __ br(__ HS, again);
    }

    __ bind(tail);
    // The count of bytes is off by 64, but we don't need to correct
    // it because we're only going to use the least-significant few
    // count bits from here on.
    // __ add(count, count, 64);

    // Each block below tests one bit of count and, when it is set,
    // stores that many bytes and post-increments dest.
    {
      Label dont;
      __ tbz(count, exact_log2(32), dont);
      __ stpq(v0, v0, __ post(dest, 32));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(16), dont);
      __ strq(v0, __ post(dest, 16));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(8), dont);
      __ strd(v0, __ post(dest, 8));
      __ bind(dont);
    }

    // Skip the sub-8-byte stores when none of the low three count bits
    // is set.
    Label finished;
    __ tst(count, 7);
    __ br(__ EQ, finished);

    {
      Label dont;
      __ tbz(count, exact_log2(4), dont);
      __ strs(v0, __ post(dest, 4));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(2), dont);
      // Copy the low byte of value into bits 8..15 so that the halfword
      // store writes the fill byte twice.
      __ bfi(value, value, 8, 8);
      __ strh(value, __ post(dest, 2));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(1), dont);
      __ strb(value, Address(dest));
      __ bind(dont);
    }

    __ bind(finished);
    __ leave();
    __ ret(lr);
    // have to exit the block and destroy the UnsafeMemoryAccessMark
    // in order to retrieve the handler end address
    }

    // install saved handler addresses in extras
    address end = __ pc();
    retrieve_unsafe_access_handlers(start, end, extras);
    assert(extras.length() == extra_count,
           "incorrect handlers count %d", extras.length());
    // record the stub entry and end plus the extras
    store_archive_data(stub_id, start, end, nullptr, &extras);

    return start;
  }
 3134 
 3135   address generate_data_cache_writeback() {
 3136     const Register line        = c_rarg0;  // address of line to write back
 3137 
 3138     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 3139     int entry_count = StubInfo::entry_count(stub_id);
 3140     assert(entry_count == 1, "sanity check");
 3141     address start = load_archive_data(stub_id);
 3142     if (start != nullptr) {
 3143       return start;
 3144     }
 3145     __ align(CodeEntryAlignment);
 3146     StubCodeMark mark(this, stub_id);
 3147 
 3148     start = __ pc();
 3149     __ enter();
 3150     __ cache_wb(Address(line, 0));
 3151     __ leave();
 3152     __ ret(lr);
 3153 
 3154     // record the stub entry and end
 3155     store_archive_data(stub_id, start, __ pc());
 3156 
 3157     return start;
 3158   }
 3159 
 3160   address generate_data_cache_writeback_sync() {
 3161     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 3162     int entry_count = StubInfo::entry_count(stub_id);
 3163     assert(entry_count == 1, "sanity check");
 3164     address start = load_archive_data(stub_id);
 3165     if (start != nullptr) {
 3166       return start;
 3167     }
 3168     const Register is_pre     = c_rarg0;  // pre or post sync
 3169     __ align(CodeEntryAlignment);
 3170     StubCodeMark mark(this, stub_id);
 3171 
 3172     // pre wbsync is a no-op
 3173     // post wbsync translates to an sfence
 3174 
 3175     Label skip;
 3176     start = __ pc();
 3177     __ enter();
 3178     __ cbnz(is_pre, skip);
 3179     __ cache_wbsync(false);
 3180     __ bind(skip);
 3181     __ leave();
 3182     __ ret(lr);
 3183 
 3184     // record the stub entry and end
 3185     store_archive_data(stub_id, start, __ pc());
 3186 
 3187     return start;
 3188   }
 3189 
  // Generate the array copy and fill stubs and publish their entry
  // addresses in the corresponding StubRoutines fields.
  //
  // The generation order matters: the common unsafe-copy exit and the
  // bulk copy helpers come first, each disjoint stub precedes the
  // conjoint stub that takes its nopush entry as an argument, and the
  // unsafe and generic stubs come after all the conjoint stubs whose
  // nopush entries they receive.
  void generate_arraycopy_stubs() {
    // Some copy stubs publish a normal entry and then a 2nd 'fallback'
    // entry immediately following their stack push. This can be used
    // as a post-push branch target for compatible stubs when they
    // identify a special case that can be handled by the fallback
    // stub, e.g. a disjoint copy stub may be used as a special case
    // fallback for its compatible conjoint copy stub.
    //
    // A no push entry is always returned in the following local and
    // then published by assigning to the appropriate entry field in
    // class StubRoutines. The entry value is then passed to the
    // generator for the compatible stub. That means the entry must be
    // listed when saving to/restoring from the AOT cache, ensuring
    // that the inter-stub jumps are noted at AOT-cache save and
    // relocated at AOT cache load.
    address nopush_entry;

    // generate the common exit first so later stubs can rely on it if
    // they want an UnsafeMemoryAccess exit non-local to the stub
    StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
    // register the stub as the default exit with class UnsafeMemoryAccess
    UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);

    // generate and publish aarch64-specific bulk copy routines first
    // so we can call them from other copy stubs
    StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
    StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);

    StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
    StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);

    StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
    StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);

    StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is used by generic/unsafe copy
    StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // jint_arraycopy_nopush always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_jint_arraycopy_nopush = nopush_entry;

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
    // disjoint normal/nopush and conjoint normal entries are not
    // generated since the arrayof versions are the same
    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
      // disjoint arrayof nopush entry is needed by conjoint copy
      StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
      // conjoint arrayof nopush entry is needed by generic/unsafe copy
      StubRoutines::_oop_arraycopy_nopush = nopush_entry;
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
      // disjoint arrayof+uninit nopush entry is needed by conjoint copy
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
      // note that we don't need a returned nopush entry because the
      // generic/unsafe copy does not cater for uninit arrays.
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
    }

    // for oop copies reuse arrayof entries for non-arrayof cases
    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
    // checkcast nopush entry is needed by generic copy
    StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic copy does not cater for uninit arrays.
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);

    // unsafe arraycopy may fallback on conjoint stubs
    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                              StubRoutines::_jshort_arraycopy_nopush,
                                                              StubRoutines::_jint_arraycopy_nopush,
                                                              StubRoutines::_jlong_arraycopy_nopush);

    // generic arraycopy may fallback on conjoint stubs
    StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                               StubRoutines::_jshort_arraycopy_nopush,
                                                               StubRoutines::_jint_arraycopy_nopush,
                                                               StubRoutines::_oop_arraycopy_nopush,
                                                               StubRoutines::_jlong_arraycopy_nopush,
                                                               StubRoutines::_checkcast_arraycopy_nopush);

    // fill stubs: plain and arrayof variants for each of jbyte, jshort and jint
    StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
    StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
    StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
    StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
    StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
    StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
  }
 3338 
 3339   void generate_math_stubs() { Unimplemented(); }
 3340 
 3341   // Arguments:
 3342   //
 3343   // Inputs:
 3344   //   c_rarg0   - source byte array address
 3345   //   c_rarg1   - destination byte array address
 3346   //   c_rarg2   - sessionKe (key) in little endian int array
 3347   //
 3348   address generate_aescrypt_encryptBlock() {
 3349     assert(UseAES, "need AES cryptographic extension support");
 3350     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 3351     int entry_count = StubInfo::entry_count(stub_id);
 3352     assert(entry_count == 1, "sanity check");
 3353     address start = load_archive_data(stub_id);
 3354     if (start != nullptr) {
 3355       return start;
 3356     }
 3357     __ align(CodeEntryAlignment);
 3358     StubCodeMark mark(this, stub_id);
 3359 
 3360     const Register from        = c_rarg0;  // source array address
 3361     const Register to          = c_rarg1;  // destination array address
 3362     const Register key         = c_rarg2;  // key array address
 3363     const Register keylen      = rscratch1;
 3364 
 3365     start = __ pc();
 3366     __ enter();
 3367 
 3368     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3369 
 3370     __ aesenc_loadkeys(key, keylen);
 3371     __ aesecb_encrypt(from, to, keylen);
 3372 
 3373     __ mov(r0, 0);
 3374 
 3375     __ leave();
 3376     __ ret(lr);
 3377 
 3378     // record the stub entry and end
 3379     store_archive_data(stub_id, start, __ pc());
 3380 
 3381     return start;
 3382   }
 3383 
 3384   // Arguments:
 3385   //
 3386   // Inputs:
 3387   //   c_rarg0   - source byte array address
 3388   //   c_rarg1   - destination byte array address
 3389   //   c_rarg2   - sessionKd (key) in little endian int array
 3390   //
 3391   address generate_aescrypt_decryptBlock() {
 3392     assert(UseAES, "need AES cryptographic extension support");
 3393     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 3394     int entry_count = StubInfo::entry_count(stub_id);
 3395     assert(entry_count == 1, "sanity check");
 3396     address start = load_archive_data(stub_id);
 3397     if (start != nullptr) {
 3398       return start;
 3399     }
 3400     __ align(CodeEntryAlignment);
 3401     StubCodeMark mark(this, stub_id);
 3402     Label L_doLast;
 3403 
 3404     const Register from        = c_rarg0;  // source array address
 3405     const Register to          = c_rarg1;  // destination array address
 3406     const Register key         = c_rarg2;  // key array address
 3407     const Register keylen      = rscratch1;
 3408 
 3409     start = __ pc();
 3410     __ enter(); // required for proper stackwalking of RuntimeStub frame
 3411 
 3412     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3413 
 3414     __ aesecb_decrypt(from, to, key, keylen);
 3415 
 3416     __ mov(r0, 0);
 3417 
 3418     __ leave();
 3419     __ ret(lr);
 3420 
 3421     // record the stub entry and end
 3422     store_archive_data(stub_id, start, __ pc());
 3423 
 3424     return start;
 3425   }
 3426 
 3427   // Arguments:
 3428   //
 3429   // Inputs:
 3430   //   c_rarg0   - source byte array address
 3431   //   c_rarg1   - destination byte array address
 3432   //   c_rarg2   - sessionKe (key) in little endian int array
 3433   //   c_rarg3   - r vector byte array address
 3434   //   c_rarg4   - input length
 3435   //
 3436   // Output:
 3437   //   x0        - input length
 3438   //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // skip code generation when load_archive_data() already supplies
    // the stub
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    start = __ pc();

      __ enter();

      // keep the original length for the return value; len_reg is
      // counted down by the loop
      __ movw(rscratch2, len_reg);

      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

      // v0 carries the chaining value, starting with the contents of rvec
      __ ld1(v0, __ T16B, rvec);

      // Select how many round keys to load from the key array length:
      // below 52 starts at v21, exactly 52 starts at v19, anything
      // larger starts at v17. (Presumably lengths 44/52/60, i.e.
      // AES-128/192/256 - confirm against the Java caller.)
      // The flags set here are tested again inside the loop.
      __ cmpw(keylen, 52);
      __ br(Assembler::CC, L_loadkeys_44);
      __ br(Assembler::EQ, L_loadkeys_52);

      __ ld1(v17, v18, __ T16B, __ post(key, 32));
      __ rev32(v17, __ T16B, v17);
      __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
      __ ld1(v19, v20, __ T16B, __ post(key, 32));
      __ rev32(v19, __ T16B, v19);
      __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
      __ rev32(v21, __ T16B, v21);
      __ rev32(v22, __ T16B, v22);
      __ rev32(v23, __ T16B, v23);
      __ rev32(v24, __ T16B, v24);
      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
      __ rev32(v25, __ T16B, v25);
      __ rev32(v26, __ T16B, v26);
      __ rev32(v27, __ T16B, v27);
      __ rev32(v28, __ T16B, v28);
      __ ld1(v29, v30, v31, __ T16B, key);
      __ rev32(v29, __ T16B, v29);
      __ rev32(v30, __ T16B, v30);
      __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
      // xor the next input block into the chaining value
      __ ld1(v1, __ T16B, __ post(from, 16));
      __ eor(v0, __ T16B, v0, v1);

      // These branches test the flags left by cmpw(keylen, 52) above;
      // the loop body is written so that nothing in between sets the
      // condition flags (note the use of subw/cbnzw for the loop count).
      __ br(Assembler::CC, L_rounds_44);
      __ br(Assembler::EQ, L_rounds_52);

      __ aese(v0, v17); __ aesmc(v0, v0);
      __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
      __ aese(v0, v19); __ aesmc(v0, v0);
      __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
      __ aese(v0, v21); __ aesmc(v0, v0);
      __ aese(v0, v22); __ aesmc(v0, v0);
      __ aese(v0, v23); __ aesmc(v0, v0);
      __ aese(v0, v24); __ aesmc(v0, v0);
      __ aese(v0, v25); __ aesmc(v0, v0);
      __ aese(v0, v26); __ aesmc(v0, v0);
      __ aese(v0, v27); __ aesmc(v0, v0);
      __ aese(v0, v28); __ aesmc(v0, v0);
      __ aese(v0, v29); __ aesmc(v0, v0);
      // the last aese is not followed by aesmc; the final key (v31) is
      // applied with a plain xor
      __ aese(v0, v30);
      __ eor(v0, __ T16B, v0, v31);

      // the output block is also the chaining value for the next block
      __ st1(v0, __ T16B, __ post(to, 16));

      __ subw(len_reg, len_reg, 16);
      __ cbnzw(len_reg, L_aes_loop);

      // write the last output block back to rvec
      __ st1(v0, __ T16B, rvec);

      // return the original input length
      __ mov(r0, rscratch2);

      __ leave();
      __ ret(lr);

      // record the stub entry and end
      store_archive_data(stub_id, start, __ pc());

      return start;
  }
 3540 
 3541   // Arguments:
 3542   //
 3543   // Inputs:
 3544   //   c_rarg0   - source byte array address
 3545   //   c_rarg1   - destination byte array address
 3546   //   c_rarg2   - sessionKd (key) in little endian int array
 3547   //   c_rarg3   - r vector byte array address
 3548   //   c_rarg4   - input length
 3549   //
 3550   // Output:
 3551   //   r0        - input length
 3552   //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // skip code generation when load_archive_data() already supplies
    // the stub
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left holding the last input block processed
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    start = __ pc();

      __ enter();

      // keep the original length for the return value; len_reg is
      // counted down by the loop
      __ movw(rscratch2, len_reg);

      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

      // v2 carries the chaining value, starting with the contents of rvec
      __ ld1(v2, __ T16B, rvec);

      // the first 16 bytes of the key array go to v31, which is applied
      // last with a plain xor after the final aesd
      __ ld1(v31, __ T16B, __ post(key, 16));
      __ rev32(v31, __ T16B, v31);

      // Select how many round keys to load from the key array length:
      // below 52 starts at v21, exactly 52 starts at v19, anything
      // larger starts at v17. (Presumably lengths 44/52/60, i.e.
      // AES-128/192/256 - confirm against the Java caller.)
      // The flags set here are tested again inside the loop.
      __ cmpw(keylen, 52);
      __ br(Assembler::CC, L_loadkeys_44);
      __ br(Assembler::EQ, L_loadkeys_52);

      __ ld1(v17, v18, __ T16B, __ post(key, 32));
      __ rev32(v17, __ T16B, v17);
      __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
      __ ld1(v19, v20, __ T16B, __ post(key, 32));
      __ rev32(v19, __ T16B, v19);
      __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
      __ rev32(v21, __ T16B, v21);
      __ rev32(v22, __ T16B, v22);
      __ rev32(v23, __ T16B, v23);
      __ rev32(v24, __ T16B, v24);
      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
      __ rev32(v25, __ T16B, v25);
      __ rev32(v26, __ T16B, v26);
      __ rev32(v27, __ T16B, v27);
      __ rev32(v28, __ T16B, v28);
      __ ld1(v29, v30, __ T16B, key);
      __ rev32(v29, __ T16B, v29);
      __ rev32(v30, __ T16B, v30);

    __ BIND(L_aes_loop);
      // load the next input block and keep an untouched copy in v1; it
      // becomes the chaining value once this block has been written
      __ ld1(v0, __ T16B, __ post(from, 16));
      __ orr(v1, __ T16B, v0, v0);

      // These branches test the flags left by cmpw(keylen, 52) above;
      // the loop body is written so that nothing in between sets the
      // condition flags (note the use of subw/cbnzw for the loop count).
      __ br(Assembler::CC, L_rounds_44);
      __ br(Assembler::EQ, L_rounds_52);

      __ aesd(v0, v17); __ aesimc(v0, v0);
      __ aesd(v0, v18); __ aesimc(v0, v0);
    __ BIND(L_rounds_52);
      __ aesd(v0, v19); __ aesimc(v0, v0);
      __ aesd(v0, v20); __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
      __ aesd(v0, v21); __ aesimc(v0, v0);
      __ aesd(v0, v22); __ aesimc(v0, v0);
      __ aesd(v0, v23); __ aesimc(v0, v0);
      __ aesd(v0, v24); __ aesimc(v0, v0);
      __ aesd(v0, v25); __ aesimc(v0, v0);
      __ aesd(v0, v26); __ aesimc(v0, v0);
      __ aesd(v0, v27); __ aesimc(v0, v0);
      __ aesd(v0, v28); __ aesimc(v0, v0);
      __ aesd(v0, v29); __ aesimc(v0, v0);
      // the last aesd is not followed by aesimc; the final key (v31) is
      // applied with a plain xor, then the chaining value is xored in
      __ aesd(v0, v30);
      __ eor(v0, __ T16B, v0, v31);
      __ eor(v0, __ T16B, v0, v2);

      __ st1(v0, __ T16B, __ post(to, 16));
      // the saved input block is the chaining value for the next block
      __ orr(v2, __ T16B, v1, v1);

      __ subw(len_reg, len_reg, 16);
      __ cbnzw(len_reg, L_aes_loop);

      // write the last input block back to rvec
      __ st1(v2, __ T16B, rvec);

      // return the original input length
      __ mov(r0, rscratch2);

      __ leave();
      __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 3658 
 3659   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3660   // Inputs: 128-bits. in is preserved.
 3661   // The least-significant 64-bit word is in the upper dword of each vector.
 3662   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 3663   // Output: result
  void be_add_128_64(FloatRegister result, FloatRegister in,
                     FloatRegister inc, FloatRegister tmp) {
    // result, tmp and inc must all be distinct; in is not part of the check
    assert_different_registers(result, tmp, inc);

    // The addition is done lane-wise on two 64-bit lanes, so a carry out
    // of the least-significant dword has to be propagated by hand: detect
    // the wrap with an unsigned compare, move the resulting mask to the
    // other lane and subtract it.
    __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
                                           // input
    __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
    __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
                                           // MSD == 0 (must be!) to LSD
    __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
  }
 3675 
 3676   // CTR AES crypt.
 3677   // Arguments:
 3678   //
 3679   // Inputs:
 3680   //   c_rarg0   - source byte array address
 3681   //   c_rarg1   - destination byte array address
 3682   //   c_rarg2   - sessionKe (key) in little endian int array
 3683   //   c_rarg3   - counter vector byte array address
 3684   //   c_rarg4   - input length
 3685   //   c_rarg5   - saved encryptedCounter start
 3686   //   c_rarg6   - saved used length
 3687   //
 3688   // Output:
 3689   //   r0       - input length
 3690   //
 3691   address generate_counterMode_AESCrypt() {
 3692     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3693     int entry_count = StubInfo::entry_count(stub_id);
 3694     assert(entry_count == 1, "sanity check");
 3695     address start = load_archive_data(stub_id);
 3696     if (start != nullptr) {
 3697       return start;
 3698     }
 3699     const Register in = c_rarg0;
 3700     const Register out = c_rarg1;
 3701     const Register key = c_rarg2;
 3702     const Register counter = c_rarg3;
 3703     const Register saved_len = c_rarg4, len = r10;
 3704     const Register saved_encrypted_ctr = c_rarg5;
 3705     const Register used_ptr = c_rarg6, used = r12;
 3706 
 3707     const Register offset = r7;
 3708     const Register keylen = r11;
 3709 
 3710     const unsigned char block_size = 16;
 3711     const int bulk_width = 4;
 3712     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3713     // performance with larger data sizes, but it also means that the
 3714     // fast path isn't used until you have at least 8 blocks, and up
 3715     // to 127 bytes of data will be executed on the slow path. For
 3716     // that reason, and also so as not to blow away too much icache, 4
 3717     // blocks seems like a sensible compromise.
 3718 
 3719     // Algorithm:
 3720     //
 3721     //    if (len == 0) {
 3722     //        goto DONE;
 3723     //    }
 3724     //    int result = len;
 3725     //    do {
 3726     //        if (used >= blockSize) {
 3727     //            if (len >= bulk_width * blockSize) {
 3728     //                CTR_large_block();
 3729     //                if (len == 0)
 3730     //                    goto DONE;
 3731     //            }
 3732     //            for (;;) {
 3733     //                16ByteVector v0 = counter;
 3734     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3735     //                used = 0;
 3736     //                if (len < blockSize)
 3737     //                    break;    /* goto NEXT */
 3738     //                16ByteVector v1 = load16Bytes(in, offset);
 3739     //                v1 = v1 ^ encryptedCounter;
 3740     //                store16Bytes(out, offset);
 3741     //                used = blockSize;
 3742     //                offset += blockSize;
 3743     //                len -= blockSize;
 3744     //                if (len == 0)
 3745     //                    goto DONE;
 3746     //            }
 3747     //        }
 3748     //      NEXT:
 3749     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3750     //        len--;
 3751     //    } while (len != 0);
 3752     //  DONE:
 3753     //    return result;
 3754     //
 3755     // CTR_large_block()
 3756     //    Wide bulk encryption of whole blocks.
 3757 
 3758     __ align(CodeEntryAlignment);
 3759     StubCodeMark mark(this, stub_id);
 3760     start = __ pc();
 3761     __ enter();
 3762 
 3763     Label DONE, CTR_large_block, large_block_return;
 3764     __ ldrw(used, Address(used_ptr));
 3765     __ cbzw(saved_len, DONE);
 3766 
 3767     __ mov(len, saved_len);
 3768     __ mov(offset, 0);
 3769 
 3770     // Compute #rounds for AES based on the length of the key array
 3771     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3772 
 3773     __ aesenc_loadkeys(key, keylen);
 3774 
 3775     {
 3776       Label L_CTR_loop, NEXT;
 3777 
 3778       __ bind(L_CTR_loop);
 3779 
 3780       __ cmp(used, block_size);
 3781       __ br(__ LO, NEXT);
 3782 
 3783       // Maybe we have a lot of data
 3784       __ subsw(rscratch1, len, bulk_width * block_size);
 3785       __ br(__ HS, CTR_large_block);
 3786       __ BIND(large_block_return);
 3787       __ cbzw(len, DONE);
 3788 
 3789       // Setup the counter
 3790       __ movi(v4, __ T4S, 0);
 3791       __ movi(v5, __ T4S, 1);
 3792       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3793 
 3794       // 128-bit big-endian increment
 3795       __ ld1(v0, __ T16B, counter);
 3796       __ rev64(v16, __ T16B, v0);
 3797       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3798       __ rev64(v16, __ T16B, v16);
 3799       __ st1(v16, __ T16B, counter);
 3800       // Previous counter value is in v0
 3801       // v4 contains { 0, 1 }
 3802 
 3803       {
 3804         // We have fewer than bulk_width blocks of data left. Encrypt
 3805         // them one by one until there is less than a full block
 3806         // remaining, being careful to save both the encrypted counter
 3807         // and the counter.
 3808 
 3809         Label inner_loop;
 3810         __ bind(inner_loop);
 3811         // Counter to encrypt is in v0
 3812         __ aesecb_encrypt(noreg, noreg, keylen);
 3813         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3814 
 3815         // Do we have a remaining full block?
 3816 
 3817         __ mov(used, 0);
 3818         __ cmp(len, block_size);
 3819         __ br(__ LO, NEXT);
 3820 
 3821         // Yes, we have a full block
 3822         __ ldrq(v1, Address(in, offset));
 3823         __ eor(v1, __ T16B, v1, v0);
 3824         __ strq(v1, Address(out, offset));
 3825         __ mov(used, block_size);
 3826         __ add(offset, offset, block_size);
 3827 
 3828         __ subw(len, len, block_size);
 3829         __ cbzw(len, DONE);
 3830 
 3831         // Increment the counter, store it back
 3832         __ orr(v0, __ T16B, v16, v16);
 3833         __ rev64(v16, __ T16B, v16);
 3834         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3835         __ rev64(v16, __ T16B, v16);
 3836         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3837 
 3838         __ b(inner_loop);
 3839       }
 3840 
 3841       __ BIND(NEXT);
 3842 
 3843       // Encrypt a single byte, and loop.
 3844       // We expect this to be a rare event.
 3845       __ ldrb(rscratch1, Address(in, offset));
 3846       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3847       __ eor(rscratch1, rscratch1, rscratch2);
 3848       __ strb(rscratch1, Address(out, offset));
 3849       __ add(offset, offset, 1);
 3850       __ add(used, used, 1);
 3851       __ subw(len, len,1);
 3852       __ cbnzw(len, L_CTR_loop);
 3853     }
 3854 
 3855     __ bind(DONE);
 3856     __ strw(used, Address(used_ptr));
 3857     __ mov(r0, saved_len);
 3858 
 3859     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3860     __ ret(lr);
 3861 
 3862     // Bulk encryption
 3863 
 3864     __ BIND (CTR_large_block);
 3865     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3866 
 3867     if (bulk_width == 8) {
 3868       __ sub(sp, sp, 4 * 16);
 3869       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3870     }
 3871     __ sub(sp, sp, 4 * 16);
 3872     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3873     RegSet saved_regs = (RegSet::of(in, out, offset)
 3874                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3875     __ push(saved_regs, sp);
 3876     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3877     __ add(in, in, offset);
 3878     __ add(out, out, offset);
 3879 
 3880     // Keys should already be loaded into the correct registers
 3881 
 3882     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3883     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3884 
 3885     // AES/CTR loop
 3886     {
 3887       Label L_CTR_loop;
 3888       __ BIND(L_CTR_loop);
 3889 
 3890       // Setup the counters
 3891       __ movi(v8, __ T4S, 0);
 3892       __ movi(v9, __ T4S, 1);
 3893       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3894 
 3895       for (int i = 0; i < bulk_width; i++) {
 3896         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3897         __ rev64(v0_ofs, __ T16B, v16);
 3898         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3899       }
 3900 
 3901       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3902 
 3903       // Encrypt the counters
 3904       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3905 
 3906       if (bulk_width == 8) {
 3907         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3908       }
 3909 
 3910       // XOR the encrypted counters with the inputs
 3911       for (int i = 0; i < bulk_width; i++) {
 3912         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3913         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3914         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3915       }
 3916 
 3917       // Write the encrypted data
 3918       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3919       if (bulk_width == 8) {
 3920         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3921       }
 3922 
 3923       __ subw(len, len, 16 * bulk_width);
 3924       __ cbnzw(len, L_CTR_loop);
 3925     }
 3926 
 3927     // Save the counter back where it goes
 3928     __ rev64(v16, __ T16B, v16);
 3929     __ st1(v16, __ T16B, counter);
 3930 
 3931     __ pop(saved_regs, sp);
 3932 
 3933     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3934     if (bulk_width == 8) {
 3935       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3936     }
 3937 
 3938     __ andr(rscratch1, len, -16 * bulk_width);
 3939     __ sub(len, len, rscratch1);
 3940     __ add(offset, offset, rscratch1);
 3941     __ mov(used, 16);
 3942     __ strw(used, Address(used_ptr));
 3943     __ b(large_block_return);
 3944 
 3945     // record the stub entry and end
 3946     store_archive_data(stub_id, start, __ pc());
 3947 
 3948     return start;
 3949   }
 3950 
 3951   // Vector AES Galois Counter Mode implementation. Parameters:
 3952   //
 3953   // in = c_rarg0
 3954   // len = c_rarg1
 3955   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3956   // out = c_rarg3
 3957   // key = c_rarg4
 3958   // state = c_rarg5 - GHASH.state
 3959   // subkeyHtbl = c_rarg6 - powers of H
 3960   // counter = c_rarg7 - 16 bytes of CTR
 3961   // return - number of processed bytes
 3962   address generate_galoisCounterMode_AESCrypt() {
 3963     Label ghash_polynomial; // local data generated after code
 3964     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3965     int entry_count = StubInfo::entry_count(stub_id);
 3966     assert(entry_count == 1, "sanity check");
 3967     address start = load_archive_data(stub_id);
 3968     if (start != nullptr) {
 3969       return start;
 3970     }
 3971     __ align(CodeEntryAlignment);
 3972     StubCodeMark mark(this, stub_id);
 3973     start = __ pc();
 3974     __ enter();
 3975 
 3976     const Register in = c_rarg0;
 3977     const Register len = c_rarg1;
 3978     const Register ct = c_rarg2;
 3979     const Register out = c_rarg3;
 3980     // and updated with the incremented counter in the end
 3981 
 3982     const Register key = c_rarg4;
 3983     const Register state = c_rarg5;
 3984 
 3985     const Register subkeyHtbl = c_rarg6;
 3986 
 3987     const Register counter = c_rarg7;
 3988 
 3989     const Register keylen = r10;
 3990     // Save state before entering routine
 3991     __ sub(sp, sp, 4 * 16);
 3992     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3993     __ sub(sp, sp, 4 * 16);
 3994     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3995 
 3996     // __ andr(len, len, -512);
 3997     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3998     __ str(len, __ pre(sp, -2 * wordSize));
 3999 
 4000     Label DONE;
 4001     __ cbz(len, DONE);
 4002 
 4003     // Compute #rounds for AES based on the length of the key array
 4004     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 4005 
 4006     __ aesenc_loadkeys(key, keylen);
 4007     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 4008     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 4009 
 4010     // AES/CTR loop
 4011     {
 4012       Label L_CTR_loop;
 4013       __ BIND(L_CTR_loop);
 4014 
 4015       // Setup the counters
 4016       __ movi(v8, __ T4S, 0);
 4017       __ movi(v9, __ T4S, 1);
 4018       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 4019 
 4020       assert(v0->encoding() < v8->encoding(), "");
 4021       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 4022         FloatRegister f = as_FloatRegister(i);
 4023         __ rev32(f, __ T16B, v16);
 4024         __ addv(v16, __ T4S, v16, v8);
 4025       }
 4026 
 4027       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 4028 
 4029       // Encrypt the counters
 4030       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 4031 
 4032       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 4033 
 4034       // XOR the encrypted counters with the inputs
 4035       for (int i = 0; i < 8; i++) {
 4036         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 4037         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 4038         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 4039       }
 4040       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 4041       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 4042 
 4043       __ subw(len, len, 16 * 8);
 4044       __ cbnzw(len, L_CTR_loop);
 4045     }
 4046 
 4047     __ rev32(v16, __ T16B, v16);
 4048     __ st1(v16, __ T16B, counter);
 4049 
 4050     __ ldr(len, Address(sp));
 4051     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 4052 
 4053     // GHASH/CTR loop
 4054     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 4055                                 len, /*unrolls*/4);
 4056 
 4057 #ifdef ASSERT
 4058     { Label L;
 4059       __ cmp(len, (unsigned char)0);
 4060       __ br(Assembler::EQ, L);
 4061       __ stop("stubGenerator: abort");
 4062       __ bind(L);
 4063   }
 4064 #endif
 4065 
 4066   __ bind(DONE);
 4067     // Return the number of bytes processed
 4068     __ ldr(r0, __ post(sp, 2 * wordSize));
 4069 
 4070     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 4071     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 4072 
 4073     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4074     __ ret(lr);
 4075 
 4076     // bind label and generate polynomial data
 4077     __ align(wordSize * 2);
 4078     __ bind(ghash_polynomial);
 4079     __ emit_int64(0x87);  // The low-order bits of the field
 4080                           // polynomial (i.e. p = z^7+z^2+z+1)
 4081                           // repeated in the low and high parts of a
 4082                           // 128-bit vector
 4083     __ emit_int64(0x87);
 4084 
 4085     // record the stub entry and end
 4086     store_archive_data(stub_id, start, __ pc());
 4087 
 4088     return start;
 4089   }
 4090 
 4091   class Cached64Bytes {
 4092   private:
 4093     MacroAssembler *_masm;
 4094     Register _regs[8];
 4095 
 4096   public:
 4097     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 4098       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
 4099       auto it = rs.begin();
 4100       for (auto &r: _regs) {
 4101         r = *it;
 4102         ++it;
 4103       }
 4104     }
 4105 
 4106     void gen_loads(Register base) {
 4107       for (int i = 0; i < 8; i += 2) {
 4108         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 4109       }
 4110     }
 4111 
 4112     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 4113     void extract_u32(Register dest, int i) {
 4114       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 4115     }
 4116   };
 4117 
 4118   // Utility routines for md5.
 4119   // Clobbers r10 and r11.
 4120   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4121               int k, int s, int t) {
 4122     Register rscratch3 = r10;
 4123     Register rscratch4 = r11;
 4124 
 4125     __ eorw(rscratch3, r3, r4);
 4126     __ movw(rscratch2, t);
 4127     __ andw(rscratch3, rscratch3, r2);
 4128     __ addw(rscratch4, r1, rscratch2);
 4129     reg_cache.extract_u32(rscratch1, k);
 4130     __ eorw(rscratch3, rscratch3, r4);
 4131     __ addw(rscratch4, rscratch4, rscratch1);
 4132     __ addw(rscratch3, rscratch3, rscratch4);
 4133     __ rorw(rscratch2, rscratch3, 32 - s);
 4134     __ addw(r1, rscratch2, r2);
 4135   }
 4136 
 4137   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4138               int k, int s, int t) {
 4139     Register rscratch3 = r10;
 4140     Register rscratch4 = r11;
 4141 
 4142     reg_cache.extract_u32(rscratch1, k);
 4143     __ movw(rscratch2, t);
 4144     __ addw(rscratch4, r1, rscratch2);
 4145     __ addw(rscratch4, rscratch4, rscratch1);
 4146     __ bicw(rscratch2, r3, r4);
 4147     __ andw(rscratch3, r2, r4);
 4148     __ addw(rscratch2, rscratch2, rscratch4);
 4149     __ addw(rscratch2, rscratch2, rscratch3);
 4150     __ rorw(rscratch2, rscratch2, 32 - s);
 4151     __ addw(r1, rscratch2, r2);
 4152   }
 4153 
 4154   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4155               int k, int s, int t) {
 4156     Register rscratch3 = r10;
 4157     Register rscratch4 = r11;
 4158 
 4159     __ eorw(rscratch3, r3, r4);
 4160     __ movw(rscratch2, t);
 4161     __ addw(rscratch4, r1, rscratch2);
 4162     reg_cache.extract_u32(rscratch1, k);
 4163     __ eorw(rscratch3, rscratch3, r2);
 4164     __ addw(rscratch4, rscratch4, rscratch1);
 4165     __ addw(rscratch3, rscratch3, rscratch4);
 4166     __ rorw(rscratch2, rscratch3, 32 - s);
 4167     __ addw(r1, rscratch2, r2);
 4168   }
 4169 
 4170   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4171               int k, int s, int t) {
 4172     Register rscratch3 = r10;
 4173     Register rscratch4 = r11;
 4174 
 4175     __ movw(rscratch3, t);
 4176     __ ornw(rscratch2, r2, r4);
 4177     __ addw(rscratch4, r1, rscratch3);
 4178     reg_cache.extract_u32(rscratch1, k);
 4179     __ eorw(rscratch3, rscratch2, r3);
 4180     __ addw(rscratch4, rscratch4, rscratch1);
 4181     __ addw(rscratch3, rscratch3, rscratch4);
 4182     __ rorw(rscratch2, rscratch3, 32 - s);
 4183     __ addw(r1, rscratch2, r2);
 4184   }
 4185 
 4186   // Arguments:
 4187   //
 4188   // Inputs:
 4189   //   c_rarg0   - byte[]  source+offset
 4190   //   c_rarg1   - int[]   SHA.state
 4191   //   c_rarg2   - int     offset
 4192   //   c_rarg3   - int     limit
 4193   //
 4194   address generate_md5_implCompress(StubId stub_id) {
 4195     bool multi_block;
 4196     switch (stub_id) {
 4197     case StubId::stubgen_md5_implCompress_id:
 4198       multi_block = false;
 4199       break;
 4200     case StubId::stubgen_md5_implCompressMB_id:
 4201       multi_block = true;
 4202       break;
 4203     default:
 4204       ShouldNotReachHere();
 4205     }
 4206     int entry_count = StubInfo::entry_count(stub_id);
 4207     assert(entry_count == 1, "sanity check");
 4208     address start = load_archive_data(stub_id);
 4209     if (start != nullptr) {
 4210       return start;
 4211     }
 4212     __ align(CodeEntryAlignment);
 4213 
 4214     StubCodeMark mark(this, stub_id);
 4215     start = __ pc();
 4216 
 4217     Register buf       = c_rarg0;
 4218     Register state     = c_rarg1;
 4219     Register ofs       = c_rarg2;
 4220     Register limit     = c_rarg3;
 4221     Register a         = r4;
 4222     Register b         = r5;
 4223     Register c         = r6;
 4224     Register d         = r7;
 4225     Register rscratch3 = r10;
 4226     Register rscratch4 = r11;
 4227 
 4228     Register state_regs[2] = { r12, r13 };
 4229     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 4230     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 4231 
 4232     __ push(saved_regs, sp);
 4233 
 4234     __ ldp(state_regs[0], state_regs[1], Address(state));
 4235     __ ubfx(a, state_regs[0],  0, 32);
 4236     __ ubfx(b, state_regs[0], 32, 32);
 4237     __ ubfx(c, state_regs[1],  0, 32);
 4238     __ ubfx(d, state_regs[1], 32, 32);
 4239 
 4240     Label md5_loop;
 4241     __ BIND(md5_loop);
 4242 
 4243     reg_cache.gen_loads(buf);
 4244 
 4245     // Round 1
 4246     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 4247     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 4248     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 4249     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 4250     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 4251     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 4252     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 4253     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 4254     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 4255     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 4256     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 4257     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 4258     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 4259     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 4260     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 4261     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 4262 
 4263     // Round 2
 4264     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 4265     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 4266     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 4267     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 4268     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 4269     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 4270     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 4271     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 4272     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 4273     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 4274     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 4275     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 4276     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 4277     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 4278     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 4279     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 4280 
 4281     // Round 3
 4282     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 4283     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 4284     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 4285     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 4286     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 4287     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 4288     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 4289     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 4290     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 4291     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 4292     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 4293     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 4294     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 4295     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 4296     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 4297     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 4298 
 4299     // Round 4
 4300     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 4301     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 4302     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 4303     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 4304     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 4305     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 4306     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 4307     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 4308     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 4309     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 4310     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 4311     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 4312     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 4313     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 4314     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 4315     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 4316 
 4317     __ addw(a, state_regs[0], a);
 4318     __ ubfx(rscratch2, state_regs[0], 32, 32);
 4319     __ addw(b, rscratch2, b);
 4320     __ addw(c, state_regs[1], c);
 4321     __ ubfx(rscratch4, state_regs[1], 32, 32);
 4322     __ addw(d, rscratch4, d);
 4323 
 4324     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 4325     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 4326 
 4327     if (multi_block) {
 4328       __ add(buf, buf, 64);
 4329       __ add(ofs, ofs, 64);
 4330       __ cmp(ofs, limit);
 4331       __ br(Assembler::LE, md5_loop);
 4332       __ mov(c_rarg0, ofs); // return ofs
 4333     }
 4334 
 4335     // write hash values back in the correct order
 4336     __ stp(state_regs[0], state_regs[1], Address(state));
 4337 
 4338     __ pop(saved_regs, sp);
 4339 
 4340     __ ret(lr);
 4341 
 4342     // record the stub entry and end
 4343     store_archive_data(stub_id, start, __ pc());
 4344 
 4345     return start;
 4346   }
 4347 
 4348   // Arguments:
 4349   //
 4350   // Inputs:
 4351   //   c_rarg0   - byte[]  source+offset
 4352   //   c_rarg1   - int[]   SHA.state
 4353   //   c_rarg2   - int     offset
 4354   //   c_rarg3   - int     limit
 4355   //
 4356   address generate_sha1_implCompress(StubId stub_id) {
 4357     bool multi_block;
 4358     switch (stub_id) {
 4359     case StubId::stubgen_sha1_implCompress_id:
 4360       multi_block = false;
 4361       break;
 4362     case StubId::stubgen_sha1_implCompressMB_id:
 4363       multi_block = true;
 4364       break;
 4365     default:
 4366       ShouldNotReachHere();
 4367     }
 4368     int entry_count = StubInfo::entry_count(stub_id);
 4369     assert(entry_count == 1, "sanity check");
 4370     address start = load_archive_data(stub_id);
 4371     if (start != nullptr) {
 4372       return start;
 4373     }
 4374     __ align(CodeEntryAlignment);
 4375 
 4376     StubCodeMark mark(this, stub_id);
 4377     start = __ pc();
 4378 
 4379     Register buf   = c_rarg0;
 4380     Register state = c_rarg1;
 4381     Register ofs   = c_rarg2;
 4382     Register limit = c_rarg3;
 4383 
 4384     Label keys;
 4385     Label sha1_loop;
 4386 
 4387     // load the keys into v0..v3
 4388     __ adr(rscratch1, keys);
 4389     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 4390     // load 5 words state into v6, v7
 4391     __ ldrq(v6, Address(state, 0));
 4392     __ ldrs(v7, Address(state, 16));
 4393 
 4394 
 4395     __ BIND(sha1_loop);
 4396     // load 64 bytes of data into v16..v19
 4397     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4398     __ rev32(v16, __ T16B, v16);
 4399     __ rev32(v17, __ T16B, v17);
 4400     __ rev32(v18, __ T16B, v18);
 4401     __ rev32(v19, __ T16B, v19);
 4402 
 4403     // do the sha1
 4404     __ addv(v4, __ T4S, v16, v0);
 4405     __ orr(v20, __ T16B, v6, v6);
 4406 
 4407     FloatRegister d0 = v16;
 4408     FloatRegister d1 = v17;
 4409     FloatRegister d2 = v18;
 4410     FloatRegister d3 = v19;
 4411 
 4412     for (int round = 0; round < 20; round++) {
 4413       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 4414       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 4415       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 4416       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 4417       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 4418 
 4419       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 4420       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 4421       __ sha1h(tmp2, __ T4S, v20);
 4422       if (round < 5)
 4423         __ sha1c(v20, __ T4S, tmp3, tmp4);
 4424       else if (round < 10 || round >= 15)
 4425         __ sha1p(v20, __ T4S, tmp3, tmp4);
 4426       else
 4427         __ sha1m(v20, __ T4S, tmp3, tmp4);
 4428       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 4429 
 4430       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4431     }
 4432 
 4433     __ addv(v7, __ T2S, v7, v21);
 4434     __ addv(v6, __ T4S, v6, v20);
 4435 
 4436     if (multi_block) {
 4437       __ add(ofs, ofs, 64);
 4438       __ cmp(ofs, limit);
 4439       __ br(Assembler::LE, sha1_loop);
 4440       __ mov(c_rarg0, ofs); // return ofs
 4441     }
 4442 
 4443     __ strq(v6, Address(state, 0));
 4444     __ strs(v7, Address(state, 16));
 4445 
 4446     __ ret(lr);
 4447 
 4448     __ bind(keys);
 4449     __ emit_int32(0x5a827999);
 4450     __ emit_int32(0x6ed9eba1);
 4451     __ emit_int32(0x8f1bbcdc);
 4452     __ emit_int32(0xca62c1d6);
 4453 
 4454     // record the stub entry and end
 4455     store_archive_data(stub_id, start, __ pc());
 4456 
 4457     return start;
 4458   }
 4459 
 4460 
 4461   // Arguments:
 4462   //
 4463   // Inputs:
 4464   //   c_rarg0   - byte[]  source+offset
 4465   //   c_rarg1   - int[]   SHA.state
 4466   //   c_rarg2   - int     offset
 4467   //   c_rarg3   - int     limit
 4468   //
 4469   address generate_sha256_implCompress(StubId stub_id) {
 4470     bool multi_block;
 4471     switch (stub_id) {
 4472     case StubId::stubgen_sha256_implCompress_id:
 4473       multi_block = false;
 4474       break;
 4475     case StubId::stubgen_sha256_implCompressMB_id:
 4476       multi_block = true;
 4477       break;
 4478     default:
 4479       ShouldNotReachHere();
 4480     }
 4481     int entry_count = StubInfo::entry_count(stub_id);
 4482     assert(entry_count == 1, "sanity check");
 4483     address start = load_archive_data(stub_id);
 4484     if (start != nullptr) {
 4485       return start;
 4486     }
 4487     __ align(CodeEntryAlignment);
 4488     StubCodeMark mark(this, stub_id);
 4489     start = __ pc();
 4490 
 4491     Register buf   = c_rarg0;
 4492     Register state = c_rarg1;
 4493     Register ofs   = c_rarg2;
 4494     Register limit = c_rarg3;
 4495 
 4496     Label sha1_loop;
 4497 
 4498     __ stpd(v8, v9, __ pre(sp, -32));
 4499     __ stpd(v10, v11, Address(sp, 16));
 4500 
 4501 // dga == v0
 4502 // dgb == v1
 4503 // dg0 == v2
 4504 // dg1 == v3
 4505 // dg2 == v4
 4506 // t0 == v6
 4507 // t1 == v7
 4508 
 4509     // load 16 keys to v16..v31
 4510     __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
 4511     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 4512     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 4513     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 4514     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 4515 
 4516     // load 8 words (256 bits) state
 4517     __ ldpq(v0, v1, state);
 4518 
 4519     __ BIND(sha1_loop);
 4520     // load 64 bytes of data into v8..v11
 4521     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4522     __ rev32(v8, __ T16B, v8);
 4523     __ rev32(v9, __ T16B, v9);
 4524     __ rev32(v10, __ T16B, v10);
 4525     __ rev32(v11, __ T16B, v11);
 4526 
 4527     __ addv(v6, __ T4S, v8, v16);
 4528     __ orr(v2, __ T16B, v0, v0);
 4529     __ orr(v3, __ T16B, v1, v1);
 4530 
 4531     FloatRegister d0 = v8;
 4532     FloatRegister d1 = v9;
 4533     FloatRegister d2 = v10;
 4534     FloatRegister d3 = v11;
 4535 
 4536 
 4537     for (int round = 0; round < 16; round++) {
 4538       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4539       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4540       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4541       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4542 
 4543       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 4544        __ orr(v4, __ T16B, v2, v2);
 4545       if (round < 15)
 4546         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4547       __ sha256h(v2, __ T4S, v3, tmp2);
 4548       __ sha256h2(v3, __ T4S, v4, tmp2);
 4549       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4550 
 4551       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4552     }
 4553 
 4554     __ addv(v0, __ T4S, v0, v2);
 4555     __ addv(v1, __ T4S, v1, v3);
 4556 
 4557     if (multi_block) {
 4558       __ add(ofs, ofs, 64);
 4559       __ cmp(ofs, limit);
 4560       __ br(Assembler::LE, sha1_loop);
 4561       __ mov(c_rarg0, ofs); // return ofs
 4562     }
 4563 
 4564     __ ldpd(v10, v11, Address(sp, 16));
 4565     __ ldpd(v8, v9, __ post(sp, 32));
 4566 
 4567     __ stpq(v0, v1, state);
 4568 
 4569     __ ret(lr);
 4570 
 4571     // record the stub entry and end
 4572     store_archive_data(stub_id, start, __ pc());
 4573 
 4574     return start;
 4575   }
 4576 
 4577   // Double rounds for sha512.
 4578   void sha512_dround(int dr,
 4579                      FloatRegister vi0, FloatRegister vi1,
 4580                      FloatRegister vi2, FloatRegister vi3,
 4581                      FloatRegister vi4, FloatRegister vrc0,
 4582                      FloatRegister vrc1, FloatRegister vin0,
 4583                      FloatRegister vin1, FloatRegister vin2,
 4584                      FloatRegister vin3, FloatRegister vin4) {
 4585       if (dr < 36) {
 4586         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4587       }
 4588       __ addv(v5, __ T2D, vrc0, vin0);
 4589       __ ext(v6, __ T16B, vi2, vi3, 8);
 4590       __ ext(v5, __ T16B, v5, v5, 8);
 4591       __ ext(v7, __ T16B, vi1, vi2, 8);
 4592       __ addv(vi3, __ T2D, vi3, v5);
 4593       if (dr < 32) {
 4594         __ ext(v5, __ T16B, vin3, vin4, 8);
 4595         __ sha512su0(vin0, __ T2D, vin1);
 4596       }
 4597       __ sha512h(vi3, __ T2D, v6, v7);
 4598       if (dr < 32) {
 4599         __ sha512su1(vin0, __ T2D, vin2, v5);
 4600       }
 4601       __ addv(vi4, __ T2D, vi1, vi3);
 4602       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4603   }
 4604 
 4605   // Arguments:
 4606   //
 4607   // Inputs:
 4608   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - long[]  SHA.state
 4610   //   c_rarg2   - int     offset
 4611   //   c_rarg3   - int     limit
 4612   //
 4613   address generate_sha512_implCompress(StubId stub_id) {
 4614     bool multi_block;
 4615     switch (stub_id) {
 4616     case StubId::stubgen_sha512_implCompress_id:
 4617       multi_block = false;
 4618       break;
 4619     case StubId::stubgen_sha512_implCompressMB_id:
 4620       multi_block = true;
 4621       break;
 4622     default:
 4623       ShouldNotReachHere();
 4624     }
 4625     int entry_count = StubInfo::entry_count(stub_id);
 4626     assert(entry_count == 1, "sanity check");
 4627     address start = load_archive_data(stub_id);
 4628     if (start != nullptr) {
 4629       return start;
 4630     }
 4631     __ align(CodeEntryAlignment);
 4632     StubCodeMark mark(this, stub_id);
 4633     start = __ pc();
 4634 
 4635     Register buf   = c_rarg0;
 4636     Register state = c_rarg1;
 4637     Register ofs   = c_rarg2;
 4638     Register limit = c_rarg3;
 4639 
 4640     __ stpd(v8, v9, __ pre(sp, -64));
 4641     __ stpd(v10, v11, Address(sp, 16));
 4642     __ stpd(v12, v13, Address(sp, 32));
 4643     __ stpd(v14, v15, Address(sp, 48));
 4644 
 4645     Label sha512_loop;
 4646 
 4647     // load state
 4648     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4649 
 4650     // load first 4 round constants
 4651     __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
 4652     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4653 
 4654     __ BIND(sha512_loop);
 4655     // load 128B of data into v12..v19
 4656     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4657     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4658     __ rev64(v12, __ T16B, v12);
 4659     __ rev64(v13, __ T16B, v13);
 4660     __ rev64(v14, __ T16B, v14);
 4661     __ rev64(v15, __ T16B, v15);
 4662     __ rev64(v16, __ T16B, v16);
 4663     __ rev64(v17, __ T16B, v17);
 4664     __ rev64(v18, __ T16B, v18);
 4665     __ rev64(v19, __ T16B, v19);
 4666 
 4667     __ mov(rscratch2, rscratch1);
 4668 
 4669     __ mov(v0, __ T16B, v8);
 4670     __ mov(v1, __ T16B, v9);
 4671     __ mov(v2, __ T16B, v10);
 4672     __ mov(v3, __ T16B, v11);
 4673 
 4674     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4675     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4676     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4677     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4678     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4679     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4680     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4681     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4682     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4683     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4684     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4685     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4686     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4687     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4688     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4689     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4690     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4691     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4692     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4693     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4694     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4695     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4696     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4697     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4698     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4699     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4700     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4701     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4702     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4703     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4704     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4705     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4706     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4707     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4708     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4709     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4710     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4711     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4712     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4713     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4714 
 4715     __ addv(v8, __ T2D, v8, v0);
 4716     __ addv(v9, __ T2D, v9, v1);
 4717     __ addv(v10, __ T2D, v10, v2);
 4718     __ addv(v11, __ T2D, v11, v3);
 4719 
 4720     if (multi_block) {
 4721       __ add(ofs, ofs, 128);
 4722       __ cmp(ofs, limit);
 4723       __ br(Assembler::LE, sha512_loop);
 4724       __ mov(c_rarg0, ofs); // return ofs
 4725     }
 4726 
 4727     __ st1(v8, v9, v10, v11, __ T2D, state);
 4728 
 4729     __ ldpd(v14, v15, Address(sp, 48));
 4730     __ ldpd(v12, v13, Address(sp, 32));
 4731     __ ldpd(v10, v11, Address(sp, 16));
 4732     __ ldpd(v8, v9, __ post(sp, 64));
 4733 
 4734     __ ret(lr);
 4735 
 4736     // record the stub entry and end
 4737     store_archive_data(stub_id, start, __ pc());
 4738 
 4739     return start;
 4740   }
 4741 
 4742   // Execute one round of keccak of two computations in parallel.
 4743   // One of the states should be loaded into the lower halves of
 4744   // the vector registers v0-v24, the other should be loaded into
 4745   // the upper halves of those registers. The ld1r instruction loads
 4746   // the round constant into both halves of register v31.
  // Intermediate results c0...c4 and d0...d4 are computed
  // in registers v25...v30.
 4749   // All vector instructions that are used operate on both register
 4750   // halves in parallel.
 4751   // If only a single computation is needed, one can only load the lower halves.
  // Emits one keccak round on the 25 lanes held in v0..v24.
  //   rscratch1 - register holding the address of this round's constant;
  //               the emitted ld1r post-increments it by 8.
  // v25..v31 are clobbered. In the comments below aN is lane N; a primed
  // name (aN') marks a new lane value that is temporarily held in a register
  // other than vN until the bcax group writes it to its final register.
  // The instruction order matters: several registers are reused as soon as
  // their previous contents have been consumed.
  void keccak_round(Register rscratch1) {
  // column parities c0..c4 -> v25..v29
  __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
  __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
  __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
  __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
  __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
  __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
  __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
  __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22

  // d0 -> v30, d1 -> v25, d2 -> v26, d3 -> v27, d4 -> v28 (c registers reused)
  __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
  __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
  __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
  __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
  __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)

  // xor with d and rotate; xar rotates right, hence the (64 - n) amounts
  __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
  __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
  __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
  __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
  __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
  __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
  __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
  __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
  __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
  __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
  __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
  __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
  __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
  __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
  __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
  __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
  __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
  __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
  __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
  __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
  __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
  __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
  __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
  __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)

  // bcax(d, n, m, a) computes d = n ^ (m & ~a), one group of five lanes at a time
  __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
  __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
  __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
  __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
  __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')

  // v31 (a20') is no longer needed, so it is reloaded with the round
  // constant here, replicated into both 64-bit halves
  __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]

  __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
  __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15)
  __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
  __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
  __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')

  __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
  __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
  __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
  __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
  __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')

  __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
  __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
  __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
  __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7')
  __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7' & a8')

  __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
  __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
  __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
  __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3')
  __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3' & a4')

  __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
  }
 4830 
 4831   // Arguments:
 4832   //
 4833   // Inputs:
 4834   //   c_rarg0   - byte[]  source+offset
 4835   //   c_rarg1   - byte[]  SHA.state
 4836   //   c_rarg2   - int     block_size
 4837   //   c_rarg3   - int     offset
 4838   //   c_rarg4   - int     limit
 4839   //
 4840   address generate_sha3_implCompress(StubId stub_id) {
 4841     bool multi_block;
 4842     switch (stub_id) {
 4843     case StubId::stubgen_sha3_implCompress_id:
 4844       multi_block = false;
 4845       break;
 4846     case StubId::stubgen_sha3_implCompressMB_id:
 4847       multi_block = true;
 4848       break;
 4849     default:
 4850       ShouldNotReachHere();
 4851     }
 4852     int entry_count = StubInfo::entry_count(stub_id);
 4853     assert(entry_count == 1, "sanity check");
 4854     address start = load_archive_data(stub_id);
 4855     if (start != nullptr) {
 4856       return start;
 4857     }
 4858     __ align(CodeEntryAlignment);
 4859     StubCodeMark mark(this, stub_id);
 4860     start = __ pc();
 4861 
 4862     Register buf           = c_rarg0;
 4863     Register state         = c_rarg1;
 4864     Register block_size    = c_rarg2;
 4865     Register ofs           = c_rarg3;
 4866     Register limit         = c_rarg4;
 4867 
 4868     Label sha3_loop, rounds24_loop;
 4869     Label sha3_512_or_sha3_384, shake128;
 4870 
 4871     __ stpd(v8, v9, __ pre(sp, -64));
 4872     __ stpd(v10, v11, Address(sp, 16));
 4873     __ stpd(v12, v13, Address(sp, 32));
 4874     __ stpd(v14, v15, Address(sp, 48));
 4875 
 4876     // load state
 4877     __ add(rscratch1, state, 32);
 4878     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4879     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4880     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4881     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4882     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4883     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4884     __ ld1(v24, __ T1D, rscratch1);
 4885 
 4886     __ BIND(sha3_loop);
 4887 
 4888     // 24 keccak rounds
 4889     __ movw(rscratch2, 24);
 4890 
 4891     // load round_constants base
 4892     __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
 4893 
 4894     // load input
 4895     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4896     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4897     __ eor(v0, __ T8B, v0, v25);
 4898     __ eor(v1, __ T8B, v1, v26);
 4899     __ eor(v2, __ T8B, v2, v27);
 4900     __ eor(v3, __ T8B, v3, v28);
 4901     __ eor(v4, __ T8B, v4, v29);
 4902     __ eor(v5, __ T8B, v5, v30);
 4903     __ eor(v6, __ T8B, v6, v31);
 4904 
 4905     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4906     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4907 
 4908     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4909     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4910     __ eor(v7, __ T8B, v7, v25);
 4911     __ eor(v8, __ T8B, v8, v26);
 4912     __ eor(v9, __ T8B, v9, v27);
 4913     __ eor(v10, __ T8B, v10, v28);
 4914     __ eor(v11, __ T8B, v11, v29);
 4915     __ eor(v12, __ T8B, v12, v30);
 4916     __ eor(v13, __ T8B, v13, v31);
 4917 
 4918     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4919     __ eor(v14, __ T8B, v14, v25);
 4920     __ eor(v15, __ T8B, v15, v26);
 4921     __ eor(v16, __ T8B, v16, v27);
 4922 
 4923     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4924     __ andw(c_rarg5, block_size, 48);
 4925     __ cbzw(c_rarg5, rounds24_loop);
 4926 
 4927     __ tbnz(block_size, 5, shake128);
 4928     // block_size == 144, bit5 == 0, SHA3-224
 4929     __ ldrd(v28, __ post(buf, 8));
 4930     __ eor(v17, __ T8B, v17, v28);
 4931     __ b(rounds24_loop);
 4932 
 4933     __ BIND(shake128);
 4934     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4935     __ eor(v17, __ T8B, v17, v28);
 4936     __ eor(v18, __ T8B, v18, v29);
 4937     __ eor(v19, __ T8B, v19, v30);
 4938     __ eor(v20, __ T8B, v20, v31);
 4939     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4940 
 4941     __ BIND(sha3_512_or_sha3_384);
 4942     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4943     __ eor(v7, __ T8B, v7, v25);
 4944     __ eor(v8, __ T8B, v8, v26);
 4945     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4946 
 4947     // SHA3-384
 4948     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4949     __ eor(v9,  __ T8B, v9,  v27);
 4950     __ eor(v10, __ T8B, v10, v28);
 4951     __ eor(v11, __ T8B, v11, v29);
 4952     __ eor(v12, __ T8B, v12, v30);
 4953 
 4954     __ BIND(rounds24_loop);
 4955     __ subw(rscratch2, rscratch2, 1);
 4956 
 4957     keccak_round(rscratch1);
 4958 
 4959     __ cbnzw(rscratch2, rounds24_loop);
 4960 
 4961     if (multi_block) {
 4962       __ add(ofs, ofs, block_size);
 4963       __ cmp(ofs, limit);
 4964       __ br(Assembler::LE, sha3_loop);
 4965       __ mov(c_rarg0, ofs); // return ofs
 4966     }
 4967 
 4968     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4969     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4970     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4971     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4972     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4973     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4974     __ st1(v24, __ T1D, state);
 4975 
 4976     // restore callee-saved registers
 4977     __ ldpd(v14, v15, Address(sp, 48));
 4978     __ ldpd(v12, v13, Address(sp, 32));
 4979     __ ldpd(v10, v11, Address(sp, 16));
 4980     __ ldpd(v8, v9, __ post(sp, 64));
 4981 
 4982     __ ret(lr);
 4983 
 4984     // record the stub entry and end
 4985     store_archive_data(stub_id, start, __ pc());
 4986 
 4987     return start;
 4988   }
 4989 
 4990   // Inputs:
 4991   //   c_rarg0   - long[]  state0
 4992   //   c_rarg1   - long[]  state1
 4993   address generate_double_keccak() {
 4994     StubId stub_id = StubId::stubgen_double_keccak_id;
 4995     int entry_count = StubInfo::entry_count(stub_id);
 4996     assert(entry_count == 1, "sanity check");
 4997     address start = load_archive_data(stub_id);
 4998     if (start != nullptr) {
 4999       return start;
 5000     }
 5001     // Implements the double_keccak() method of the
 5002     // sun.secyrity.provider.SHA3Parallel class
 5003     __ align(CodeEntryAlignment);
 5004     StubCodeMark mark(this, stub_id);
 5005     start = __ pc();
 5006     __ enter();
 5007 
 5008     Register state0        = c_rarg0;
 5009     Register state1        = c_rarg1;
 5010 
 5011     Label rounds24_loop;
 5012 
 5013     // save callee-saved registers
 5014     __ stpd(v8, v9, __ pre(sp, -64));
 5015     __ stpd(v10, v11, Address(sp, 16));
 5016     __ stpd(v12, v13, Address(sp, 32));
 5017     __ stpd(v14, v15, Address(sp, 48));
 5018 
 5019     // load states
 5020     __ add(rscratch1, state0, 32);
 5021     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 5022     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 5023     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 5024     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 5025     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 5026     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 5027     __ ld1(v24, __ D, 0, rscratch1);
 5028     __ add(rscratch1, state1, 32);
 5029     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 5030     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 5031     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 5032     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 5033     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 5034     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 5035     __ ld1(v24, __ D, 1, rscratch1);
 5036 
 5037     // 24 keccak rounds
 5038     __ movw(rscratch2, 24);
 5039 
 5040     // load round_constants base
 5041     __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
 5042 
 5043     __ BIND(rounds24_loop);
 5044     __ subw(rscratch2, rscratch2, 1);
 5045     keccak_round(rscratch1);
 5046     __ cbnzw(rscratch2, rounds24_loop);
 5047 
 5048     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 5049     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 5050     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 5051     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 5052     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 5053     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 5054     __ st1(v24, __ D, 0, state0);
 5055     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 5056     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 5057     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 5058     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 5059     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 5060     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 5061     __ st1(v24, __ D, 1, state1);
 5062 
 5063     // restore callee-saved vector registers
 5064     __ ldpd(v14, v15, Address(sp, 48));
 5065     __ ldpd(v12, v13, Address(sp, 32));
 5066     __ ldpd(v10, v11, Address(sp, 16));
 5067     __ ldpd(v8, v9, __ post(sp, 64));
 5068 
 5069     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5070     __ mov(r0, zr); // return 0
 5071     __ ret(lr);
 5072 
 5073     // record the stub entry and end
 5074     store_archive_data(stub_id, start, __ pc());
 5075 
 5076     return start;
 5077   }
 5078 
 5079   // ChaCha20 block function.  This version parallelizes the 32-bit
 5080   // state elements on each of 16 vectors, producing 4 blocks of
 5081   // keystream at a time.
 5082   //
 5083   // state (int[16]) = c_rarg0
 5084   // keystream (byte[256]) = c_rarg1
 5085   // return - number of bytes of produced keystream (always 256)
 5086   //
 5087   // This implementation takes each 32-bit integer from the state
 5088   // array and broadcasts it across all 4 32-bit lanes of a vector register
 5089   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 5090   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 5091   // the quarter round schedule is implemented as outlined in RFC 7539 section
 5092   // 2.3.  However, instead of sequentially processing the 3 quarter round
 5093   // operations represented by one QUARTERROUND function, we instead stack all
 5094   // the adds, xors and left-rotations from the first 4 quarter rounds together
 5095   // and then do the same for the second set of 4 quarter rounds.  This removes
 5096   // some latency that would otherwise be incurred by waiting for an add to
 5097   // complete before performing an xor (which depends on the result of the
 5098   // add), etc. An adjustment happens between the first and second groups of 4
 5099   // quarter rounds, but this is done only in the inputs to the macro functions
 5100   // that generate the assembly instructions - these adjustments themselves are
 5101   // not part of the resulting assembly.
 5102   // The 4 registers v0-v3 are used during the quarter round operations as
 5103   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 5104   // registers become the vectors involved in adding the start state back onto
 5105   // the post-QR working state.  After the adds are complete, each of the 16
 5106   // vectors write their first lane back to the keystream buffer, followed
 5107   // by the second lane from all vectors and so on.
 5108   address generate_chacha20Block_blockpar() {
 5109     StubId stub_id = StubId::stubgen_chacha20Block_id;
 5110     int entry_count = StubInfo::entry_count(stub_id);
 5111     assert(entry_count == 1, "sanity check");
 5112     address start = load_archive_data(stub_id);
 5113     if (start != nullptr) {
 5114       return start;
 5115     }
 5116     Label L_twoRounds, L_cc20_const;
 5117     __ align(CodeEntryAlignment);
 5118     StubCodeMark mark(this, stub_id);
 5119     start = __ pc();
 5120     __ enter();
 5121 
 5122     int i, j;
 5123     const Register state = c_rarg0;
 5124     const Register keystream = c_rarg1;
 5125     const Register loopCtr = r10;
 5126     const Register tmpAddr = r11;
 5127     const FloatRegister ctrAddOverlay = v28;
 5128     const FloatRegister lrot8Tbl = v29;
 5129 
 5130     // Organize SIMD registers in an array that facilitates
 5131     // putting repetitive opcodes into loop structures.  It is
 5132     // important that each grouping of 4 registers is monotonically
 5133     // increasing to support the requirements of multi-register
 5134     // instructions (e.g. ld4r, st4, etc.)
 5135     const FloatRegister workSt[16] = {
 5136          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 5137         v20, v21, v22, v23, v24, v25, v26, v27
 5138     };
 5139 
 5140     // Pull in constant data.  The first 16 bytes are the add overlay
 5141     // which is applied to the vector holding the counter (state[12]).
 5142     // The second 16 bytes is the index register for the 8-bit left
 5143     // rotation tbl instruction.
 5144     __ adr(tmpAddr, L_cc20_const);
 5145     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 5146 
 5147     // Load from memory and interlace across 16 SIMD registers,
 5148     // With each word from memory being broadcast to all lanes of
 5149     // each successive SIMD register.
 5150     //      Addr(0) -> All lanes in workSt[i]
 5151     //      Addr(4) -> All lanes workSt[i + 1], etc.
 5152     __ mov(tmpAddr, state);
 5153     for (i = 0; i < 16; i += 4) {
 5154       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 5155           __ post(tmpAddr, 16));
 5156     }
 5157     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5158 
 5159     // Before entering the loop, create 5 4-register arrays.  These
 5160     // will hold the 4 registers that represent the a/b/c/d fields
 5161     // in the quarter round operation.  For instance the "b" field
 5162     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 5163     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 5164     // since it is part of a diagonal organization.  The aSet and scratch
 5165     // register sets are defined at declaration time because they do not change
 5166     // organization at any point during the 20-round processing.
 5167     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 5168     FloatRegister bSet[4];
 5169     FloatRegister cSet[4];
 5170     FloatRegister dSet[4];
 5171     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 5172 
 5173     // Set up the 10 iteration loop and perform all 8 quarter round ops
 5174     __ mov(loopCtr, 10);
 5175     __ BIND(L_twoRounds);
 5176 
 5177     // Set to columnar organization and do the following 4 quarter-rounds:
 5178     // QUARTERROUND(0, 4, 8, 12)
 5179     // QUARTERROUND(1, 5, 9, 13)
 5180     // QUARTERROUND(2, 6, 10, 14)
 5181     // QUARTERROUND(3, 7, 11, 15)
 5182     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 5183     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 5184     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 5185 
 5186     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5187     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5188     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5189 
 5190     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5191     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5192     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5193 
 5194     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5195     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5196     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5197 
 5198     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5199     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5200     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 12
 5201 
 5202     // Set to diagonal organization and do the next 4 quarter-rounds:
 5203     // QUARTERROUND(0, 5, 10, 15)
 5204     // QUARTERROUND(1, 6, 11, 12)
 5205     // QUARTERROUND(2, 7, 8, 13)
 5206     // QUARTERROUND(3, 4, 9, 14)
 5207     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 5208     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 5209     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 5210 
 5211     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5212     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5213     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5214 
 5215     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5216     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5217     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5218 
 5219     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5220     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5221     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5222 
 5223     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5224     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 5226 
 5227     // Decrement and iterate
 5228     __ sub(loopCtr, loopCtr, 1);
 5229     __ cbnz(loopCtr, L_twoRounds);
 5230 
 5231     __ mov(tmpAddr, state);
 5232 
 5233     // Add the starting state back to the post-loop keystream
 5234     // state.  We read/interlace the state array from memory into
 5235     // 4 registers similar to what we did in the beginning.  Then
 5236     // add the counter overlay onto workSt[12] at the end.
 5237     for (i = 0; i < 16; i += 4) {
 5238       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 5239       __ addv(workSt[i], __ T4S, workSt[i], v0);
 5240       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 5241       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 5242       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 5243     }
 5244     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5245 
 5246     // Write working state into the keystream buffer.  This is accomplished
 5247     // by taking the lane "i" from each of the four vectors and writing
 5248     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 5249     // repeating with the next 4 vectors until all 16 vectors have been used.
 5250     // Then move to the next lane and repeat the process until all lanes have
 5251     // been written.
 5252     for (i = 0; i < 4; i++) {
 5253       for (j = 0; j < 16; j += 4) {
 5254         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 5255             __ post(keystream, 16));
 5256       }
 5257     }
 5258 
 5259     __ mov(r0, 256);             // Return length of output keystream
 5260     __ leave();
 5261     __ ret(lr);
 5262 
 5263     // bind label and generate local constant data used by this stub
 5264     // The constant data is broken into two 128-bit segments to be loaded
 5265     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 5266     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 5267     // The second 128-bits is a table constant used for 8-bit left rotations.
 5268     __ BIND(L_cc20_const);
 5269     __ emit_int64(0x0000000100000000UL);
 5270     __ emit_int64(0x0000000300000002UL);
 5271     __ emit_int64(0x0605040702010003UL);
 5272     __ emit_int64(0x0E0D0C0F0A09080BUL);
 5273 
 5274     // record the stub entry and end
 5275     store_archive_data(stub_id, start, __ pc());
 5276 
 5277     return start;
 5278   }
 5279 
 5280   // Helpers to schedule parallel operation bundles across vector
 5281   // register sequences of size 2, 4 or 8.
 5282 
 5283   // Implement various primitive computations across vector sequences
 5284 
 5285   template<int N>
 5286   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5287                const VSeq<N>& v1, const VSeq<N>& v2) {
 5288     // output must not be constant
 5289     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5290     // output cannot overwrite pending inputs
 5291     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5292     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5293     for (int i = 0; i < N; i++) {
 5294       __ addv(v[i], T, v1[i], v2[i]);
 5295     }
 5296   }
 5297 
 5298   template<int N>
 5299   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5300                const VSeq<N>& v1, const VSeq<N>& v2) {
 5301     // output must not be constant
 5302     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5303     // output cannot overwrite pending inputs
 5304     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5305     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5306     for (int i = 0; i < N; i++) {
 5307       __ subv(v[i], T, v1[i], v2[i]);
 5308     }
 5309   }
 5310 
 5311   template<int N>
 5312   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5313                const VSeq<N>& v1, const VSeq<N>& v2) {
 5314     // output must not be constant
 5315     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5316     // output cannot overwrite pending inputs
 5317     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5318     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5319     for (int i = 0; i < N; i++) {
 5320       __ mulv(v[i], T, v1[i], v2[i]);
 5321     }
 5322   }
 5323 
 5324   template<int N>
 5325   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 5326     // output must not be constant
 5327     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5328     // output cannot overwrite pending inputs
 5329     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5330     for (int i = 0; i < N; i++) {
 5331       __ negr(v[i], T, v1[i]);
 5332     }
 5333   }
 5334 
 5335   template<int N>
 5336   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5337                const VSeq<N>& v1, int shift) {
 5338     // output must not be constant
 5339     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5340     // output cannot overwrite pending inputs
 5341     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5342     for (int i = 0; i < N; i++) {
 5343       __ sshr(v[i], T, v1[i], shift);
 5344     }
 5345   }
 5346 
 5347   template<int N>
 5348   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5349     // output must not be constant
 5350     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5351     // output cannot overwrite pending inputs
 5352     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5353     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5354     for (int i = 0; i < N; i++) {
 5355       __ andr(v[i], __ T16B, v1[i], v2[i]);
 5356     }
 5357   }
 5358 
 5359   template<int N>
 5360   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5361     // output must not be constant
 5362     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5363     // output cannot overwrite pending inputs
 5364     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5365     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5366     for (int i = 0; i < N; i++) {
 5367       __ orr(v[i], __ T16B, v1[i], v2[i]);
 5368     }
 5369   }
 5370 
 5371   template<int N>
 5372   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 5373     // output must not be constant
 5374     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5375     // output cannot overwrite pending inputs
 5376     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5377     for (int i = 0; i < N; i++) {
 5378       __ notr(v[i], __ T16B, v1[i]);
 5379     }
 5380   }
 5381 
 5382   template<int N>
 5383   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 5384     // output must not be constant
 5385     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5386     // output cannot overwrite pending inputs
 5387     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5388     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5389     for (int i = 0; i < N; i++) {
 5390       __ sqdmulh(v[i], T, v1[i], v2[i]);
 5391     }
 5392   }
 5393 
 5394   template<int N>
 5395   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 5396     // output must not be constant
 5397     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5398     // output cannot overwrite pending inputs
 5399     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5400     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5401     for (int i = 0; i < N; i++) {
 5402       __ mlsv(v[i], T, v1[i], v2[i]);
 5403     }
 5404   }
 5405 
 5406   // load N/2 successive pairs of quadword values from memory in order
 5407   // into N successive vector registers of the sequence via the
 5408   // address supplied in base.
 5409   template<int N>
 5410   void vs_ldpq(const VSeq<N>& v, Register base) {
 5411     for (int i = 0; i < N; i += 2) {
 5412       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 5413     }
 5414   }
 5415 
 5416   // load N/2 successive pairs of quadword values from memory in order
 5417   // into N vector registers of the sequence via the address supplied
 5418   // in base using post-increment addressing
 5419   template<int N>
 5420   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 5421     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 5422     for (int i = 0; i < N; i += 2) {
 5423       __ ldpq(v[i], v[i+1], __ post(base, 32));
 5424     }
 5425   }
 5426 
 5427   // store N successive vector registers of the sequence into N/2
 5428   // successive pairs of quadword memory locations via the address
 5429   // supplied in base using post-increment addressing
 5430   template<int N>
 5431   void vs_stpq_post(const VSeq<N>& v, Register base) {
 5432     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 5433     for (int i = 0; i < N; i += 2) {
 5434       __ stpq(v[i], v[i+1], __ post(base, 32));
 5435     }
 5436   }
 5437 
 5438   // load N/2 pairs of quadword values from memory de-interleaved into
 5439   // N vector registers 2 at a time via the address supplied in base
 5440   // using post-increment addressing.
 5441   template<int N>
 5442   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5443     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 5444     for (int i = 0; i < N; i += 2) {
 5445       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 5446     }
 5447   }
 5448 
 5449   // store N vector registers interleaved into N/2 pairs of quadword
 5450   // memory locations via the address supplied in base using
 5451   // post-increment addressing.
 5452   template<int N>
 5453   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5454     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 5455     for (int i = 0; i < N; i += 2) {
 5456       __ st2(v[i], v[i+1], T, __ post(base, 32));
 5457     }
 5458   }
 5459 
 5460   // load N quadword values from memory de-interleaved into N vector
 5461   // registers 3 elements at a time via the address supplied in base.
 5462   template<int N>
 5463   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5464     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 5465     for (int i = 0; i < N; i += 3) {
 5466       __ ld3(v[i], v[i+1], v[i+2], T, base);
 5467     }
 5468   }
 5469 
 5470   // load N quadword values from memory de-interleaved into N vector
 5471   // registers 3 elements at a time via the address supplied in base
 5472   // using post-increment addressing.
 5473   template<int N>
 5474   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5475     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 5476     for (int i = 0; i < N; i += 3) {
 5477       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 5478     }
 5479   }
 5480 
 5481   // load N/2 pairs of quadword values from memory into N vector
 5482   // registers via the address supplied in base with each pair indexed
 5483   // using the the start offset plus the corresponding entry in the
 5484   // offsets array
 5485   template<int N>
 5486   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5487     for (int i = 0; i < N/2; i++) {
 5488       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5489     }
 5490   }
 5491 
 5492   // store N vector registers into N/2 pairs of quadword memory
 5493   // locations via the address supplied in base with each pair indexed
 5494   // using the the start offset plus the corresponding entry in the
 5495   // offsets array
 5496   template<int N>
 5497   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 5498     for (int i = 0; i < N/2; i++) {
 5499       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5500     }
 5501   }
 5502 
 5503   // load N single quadword values from memory into N vector registers
 5504   // via the address supplied in base with each value indexed using
 5505   // the the start offset plus the corresponding entry in the offsets
 5506   // array
 5507   template<int N>
 5508   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5509                       int start, int (&offsets)[N]) {
 5510     for (int i = 0; i < N; i++) {
 5511       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5512     }
 5513   }
 5514 
 5515   // store N vector registers into N single quadword memory locations
 5516   // via the address supplied in base with each value indexed using
 5517   // the the start offset plus the corresponding entry in the offsets
 5518   // array
 5519   template<int N>
 5520   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5521                       int start, int (&offsets)[N]) {
 5522     for (int i = 0; i < N; i++) {
 5523       __ str(v[i], T, Address(base, start + offsets[i]));
 5524     }
 5525   }
 5526 
 5527   // load N/2 pairs of quadword values from memory de-interleaved into
 5528   // N vector registers 2 at a time via the address supplied in base
 5529   // with each pair indexed using the the start offset plus the
 5530   // corresponding entry in the offsets array
 5531   template<int N>
 5532   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5533                       Register tmp, int start, int (&offsets)[N/2]) {
 5534     for (int i = 0; i < N/2; i++) {
 5535       __ add(tmp, base, start + offsets[i]);
 5536       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5537     }
 5538   }
 5539 
 5540   // store N vector registers 2 at a time interleaved into N/2 pairs
 5541   // of quadword memory locations via the address supplied in base
 5542   // with each pair indexed using the the start offset plus the
 5543   // corresponding entry in the offsets array
 5544   template<int N>
 5545   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5546                       Register tmp, int start, int (&offsets)[N/2]) {
 5547     for (int i = 0; i < N/2; i++) {
 5548       __ add(tmp, base, start + offsets[i]);
 5549       __ st2(v[2*i], v[2*i+1], T, tmp);
 5550     }
 5551   }
 5552 
 5553   // Helper routines for various flavours of Montgomery multiply
 5554 
 5555   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5556   // multiplications in parallel
 5557   //
 5558 
 5559   // See the montMul() method of the sun.security.provider.ML_DSA
 5560   // class.
 5561   //
 5562   // Computes 4x4S results or 8x8H results
 5563   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5564   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5565   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5566   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5567   // Outputs: va - 4x4S or 4x8H vector register sequences
 5568   // vb, vc, vtmp and vq must all be disjoint
 5569   // va must be disjoint from all other inputs/temps or must equal vc
 5570   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5571   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5572   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5573                    Assembler::SIMD_Arrangement T,
 5574                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5575     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5576     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5577     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5578     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5579 
 5580     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5581     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5582 
 5583     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5584 
 5585     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5586     assert(vs_disjoint(va, vb), "va and vb overlap");
 5587     assert(vs_disjoint(va, vq), "va and vq overlap");
 5588     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5589     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5590 
 5591     // schedule 4 streams of instructions across the vector sequences
 5592     for (int i = 0; i < 4; i++) {
 5593       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5594       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5595     }
 5596 
 5597     for (int i = 0; i < 4; i++) {
 5598       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5599     }
 5600 
 5601     for (int i = 0; i < 4; i++) {
 5602       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5603     }
 5604 
 5605     for (int i = 0; i < 4; i++) {
 5606       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5607     }
 5608   }
 5609 
 5610   // Perform 8 32-bit (4x4S) or 16 16-bit (2 x 8H) Montgomery
 5611   // multiplications in parallel
 5612   //
 5613 
 5614   // See the montMul() method of the sun.security.provider.ML_DSA
 5615   // class.
 5616   //
 5617   // Computes 4x4S results or 8x8H results
 5618   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5619   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5620   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5621   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5622   // Outputs: va - 4x4S or 4x8H vector register sequences
 5623   // vb, vc, vtmp and vq must all be disjoint
 5624   // va must be disjoint from all other inputs/temps or must equal vc
 5625   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5626   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5627   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5628                    Assembler::SIMD_Arrangement T,
 5629                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5630     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5631     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5632     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5633     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5634 
 5635     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5636     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5637 
 5638     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5639 
 5640     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5641     assert(vs_disjoint(va, vb), "va and vb overlap");
 5642     assert(vs_disjoint(va, vq), "va and vq overlap");
 5643     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5644     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5645 
 5646     // schedule 2 streams of instructions across the vector sequences
 5647     for (int i = 0; i < 2; i++) {
 5648       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5649       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5650     }
 5651 
 5652     for (int i = 0; i < 2; i++) {
 5653       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5654     }
 5655 
 5656     for (int i = 0; i < 2; i++) {
 5657       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5658     }
 5659 
 5660     for (int i = 0; i < 2; i++) {
 5661       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5662     }
 5663   }
 5664 
 5665   // Perform 16 16-bit Montgomery multiplications in parallel.
 5666   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5667                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5668     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5669     // It will assert that the register use is valid
 5670     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5671   }
 5672 
 5673   // Perform 32 16-bit Montgomery multiplications in parallel.
 5674   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5675                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5676     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5677     // It will assert that the register use is valid
 5678     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5679   }
 5680 
 5681   // Perform 64 16-bit Montgomery multiplications in parallel.
 5682   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5683                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5684     // Schedule two successive 4x8H multiplies via the montmul helper
 5685     // on the front and back halves of va, vb and vc. The helper will
 5686     // assert that the register use has no overlap conflicts on each
 5687     // individual call but we also need to ensure that the necessary
 5688     // disjoint/equality constraints are met across both calls.
 5689 
 5690     // vb, vc, vtmp and vq must be disjoint. va must either be
 5691     // disjoint from all other registers or equal vc
 5692 
 5693     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5694     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5695     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5696 
 5697     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5698     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5699 
 5700     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5701 
 5702     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5703     assert(vs_disjoint(va, vb), "va and vb overlap");
 5704     assert(vs_disjoint(va, vq), "va and vq overlap");
 5705     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5706 
 5707     // we multiply the front and back halves of each sequence 4 at a
 5708     // time because
 5709     //
 5710     // 1) we are currently only able to get 4-way instruction
 5711     // parallelism at best
 5712     //
 5713     // 2) we need registers for the constants in vq and temporary
 5714     // scratch registers to hold intermediate results so vtmp can only
 5715     // be a VSeq<4> which means we only have 4 scratch slots
 5716 
 5717     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5718     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5719   }
 5720 
 5721   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5722                                const VSeq<4>& vc,
 5723                                const VSeq<4>& vtmp,
 5724                                const VSeq<2>& vq) {
 5725     // compute a = montmul(a1, c)
 5726     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5727     // ouptut a1 = a0 - a
 5728     vs_subv(va1, __ T8H, va0, vc);
 5729     //    and a0 = a0 + a
 5730     vs_addv(va0, __ T8H, va0, vc);
 5731   }
 5732 
 5733   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5734                                const VSeq<4>& vb,
 5735                                const VSeq<4>& vtmp1,
 5736                                const VSeq<4>& vtmp2,
 5737                                const VSeq<2>& vq) {
 5738     // compute c = a0 - a1
 5739     vs_subv(vtmp1, __ T8H, va0, va1);
 5740     // output a0 = a0 + a1
 5741     vs_addv(va0, __ T8H, va0, va1);
 5742     // output a1 = b montmul c
 5743     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5744   }
 5745 
 5746   void load64shorts(const VSeq<8>& v, Register shorts) {
 5747     vs_ldpq_post(v, shorts);
 5748   }
 5749 
 5750   void load32shorts(const VSeq<4>& v, Register shorts) {
 5751     vs_ldpq_post(v, shorts);
 5752   }
 5753 
 5754   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5755     vs_stpq_post(v, tmpAddr);
 5756   }
 5757 
 5758   // Kyber NTT function.
 5759   // Implements
 5760   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5761   //
 5762   // coeffs (short[256]) = c_rarg0
 5763   // ntt_zetas (short[256]) = c_rarg1
 5764   address generate_kyberNtt() {
 5765     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5766     int entry_count = StubInfo::entry_count(stub_id);
 5767     assert(entry_count == 1, "sanity check");
 5768     address start = load_archive_data(stub_id);
 5769     if (start != nullptr) {
 5770       return start;
 5771     }
 5772     __ align(CodeEntryAlignment);
 5773     StubCodeMark mark(this, stub_id);
 5774     start = __ pc();
 5775     __ enter();
 5776 
 5777     const Register coeffs = c_rarg0;
 5778     const Register zetas = c_rarg1;
 5779 
 5780     const Register kyberConsts = r10;
 5781     const Register tmpAddr = r11;
 5782 
 5783     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5784     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5785     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5786 
 5787     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5788     // load the montmul constants
 5789     vs_ldpq(vq, kyberConsts);
 5790 
 5791     // Each level corresponds to an iteration of the outermost loop of the
 5792     // Java method seilerNTT(int[] coeffs). There are some differences
 5793     // from what is done in the seilerNTT() method, though:
 5794     // 1. The computation is using 16-bit signed values, we do not convert them
 5795     // to ints here.
 5796     // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
 5797     // this array for each level, it is easier that way to fill up the vector
 5798     // registers.
 5799     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5800     // multiplications (this is because that way there should not be any
    // overflow during the inverse NTT computation), here we use R = 2^16 so
 5802     // that we can use the 16-bit arithmetic in the vector unit.
 5803     //
 5804     // On each level, we fill up the vector registers in such a way that the
 5805     // array elements that need to be multiplied by the zetas go into one
 5806     // set of vector registers while the corresponding ones that don't need to
 5807     // be multiplied, go into another set.
 5808     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5809     // registers interleaving the steps of 4 identical computations,
 5810     // each done on 8 16-bit values per register.
 5811 
 5812     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5813     // to the zetas occur in discrete blocks whose size is some multiple
 5814     // of 32.
 5815 
 5816     // level 0
 5817     __ add(tmpAddr, coeffs, 256);
 5818     load64shorts(vs1, tmpAddr);
 5819     load64shorts(vs2, zetas);
 5820     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5821     __ add(tmpAddr, coeffs, 0);
 5822     load64shorts(vs1, tmpAddr);
 5823     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5824     vs_addv(vs1, __ T8H, vs1, vs2);
 5825     __ add(tmpAddr, coeffs, 0);
 5826     vs_stpq_post(vs1, tmpAddr);
 5827     __ add(tmpAddr, coeffs, 256);
 5828     vs_stpq_post(vs3, tmpAddr);
 5829     // restore montmul constants
 5830     vs_ldpq(vq, kyberConsts);
 5831     load64shorts(vs1, tmpAddr);
 5832     load64shorts(vs2, zetas);
 5833     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5834     __ add(tmpAddr, coeffs, 128);
 5835     load64shorts(vs1, tmpAddr);
 5836     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5837     vs_addv(vs1, __ T8H, vs1, vs2);
 5838     __ add(tmpAddr, coeffs, 128);
 5839     store64shorts(vs1, tmpAddr);
 5840     __ add(tmpAddr, coeffs, 384);
 5841     store64shorts(vs3, tmpAddr);
 5842 
 5843     // level 1
 5844     // restore montmul constants
 5845     vs_ldpq(vq, kyberConsts);
 5846     __ add(tmpAddr, coeffs, 128);
 5847     load64shorts(vs1, tmpAddr);
 5848     load64shorts(vs2, zetas);
 5849     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5850     __ add(tmpAddr, coeffs, 0);
 5851     load64shorts(vs1, tmpAddr);
 5852     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5853     vs_addv(vs1, __ T8H, vs1, vs2);
 5854     __ add(tmpAddr, coeffs, 0);
 5855     store64shorts(vs1, tmpAddr);
 5856     store64shorts(vs3, tmpAddr);
 5857     vs_ldpq(vq, kyberConsts);
 5858     __ add(tmpAddr, coeffs, 384);
 5859     load64shorts(vs1, tmpAddr);
 5860     load64shorts(vs2, zetas);
 5861     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5862     __ add(tmpAddr, coeffs, 256);
 5863     load64shorts(vs1, tmpAddr);
 5864     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5865     vs_addv(vs1, __ T8H, vs1, vs2);
 5866     __ add(tmpAddr, coeffs, 256);
 5867     store64shorts(vs1, tmpAddr);
 5868     store64shorts(vs3, tmpAddr);
 5869 
 5870     // level 2
 5871     vs_ldpq(vq, kyberConsts);
 5872     int offsets1[4] = { 0, 32, 128, 160 };
 5873     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5874     load64shorts(vs2, zetas);
 5875     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5876     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5877     // kyber_subv_addv64();
 5878     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5879     vs_addv(vs1, __ T8H, vs1, vs2);
 5880     __ add(tmpAddr, coeffs, 0);
 5881     vs_stpq_post(vs_front(vs1), tmpAddr);
 5882     vs_stpq_post(vs_front(vs3), tmpAddr);
 5883     vs_stpq_post(vs_back(vs1), tmpAddr);
 5884     vs_stpq_post(vs_back(vs3), tmpAddr);
 5885     vs_ldpq(vq, kyberConsts);
 5886     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5887     load64shorts(vs2, zetas);
 5888     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5889     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5890     // kyber_subv_addv64();
 5891     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5892     vs_addv(vs1, __ T8H, vs1, vs2);
 5893     __ add(tmpAddr, coeffs, 256);
 5894     vs_stpq_post(vs_front(vs1), tmpAddr);
 5895     vs_stpq_post(vs_front(vs3), tmpAddr);
 5896     vs_stpq_post(vs_back(vs1), tmpAddr);
 5897     vs_stpq_post(vs_back(vs3), tmpAddr);
 5898 
 5899     // level 3
 5900     vs_ldpq(vq, kyberConsts);
 5901     int offsets2[4] = { 0, 64, 128, 192 };
 5902     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5903     load64shorts(vs2, zetas);
 5904     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5905     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5906     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5907     vs_addv(vs1, __ T8H, vs1, vs2);
 5908     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5909     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5910 
 5911     vs_ldpq(vq, kyberConsts);
 5912     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5913     load64shorts(vs2, zetas);
 5914     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5915     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5916     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5917     vs_addv(vs1, __ T8H, vs1, vs2);
 5918     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5919     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5920 
 5921     // level 4
 5922     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5923     // so they are loaded using employing an ldr at 8 distinct offsets.
 5924 
 5925     vs_ldpq(vq, kyberConsts);
 5926     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5927     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5928     load64shorts(vs2, zetas);
 5929     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5930     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5931     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5932     vs_addv(vs1, __ T8H, vs1, vs2);
 5933     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5934     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5935 
 5936     vs_ldpq(vq, kyberConsts);
 5937     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5938     load64shorts(vs2, zetas);
 5939     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5940     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5941     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5942     vs_addv(vs1, __ T8H, vs1, vs2);
 5943     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5944     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5945 
 5946     // level 5
 5947     // At level 5 related coefficients occur in discrete blocks of size 8 so
 5948     // need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5949 
 5950     vs_ldpq(vq, kyberConsts);
 5951     int offsets4[4] = { 0, 32, 64, 96 };
 5952     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5953     load32shorts(vs_front(vs2), zetas);
 5954     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5955     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5956     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5957     load32shorts(vs_front(vs2), zetas);
 5958     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5959     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5960     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5961     load32shorts(vs_front(vs2), zetas);
 5962     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5963     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5964 
 5965     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5966     load32shorts(vs_front(vs2), zetas);
 5967     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5968     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5969 
 5970     // level 6
 5971     // At level 6 related coefficients occur in discrete blocks of size 4 so
 5972     // need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5973 
 5974     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5975     load32shorts(vs_front(vs2), zetas);
 5976     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5977     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5978     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5979     // __ ldpq(v18, v19, __ post(zetas, 32));
 5980     load32shorts(vs_front(vs2), zetas);
 5981     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5982     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5983 
 5984     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5985     load32shorts(vs_front(vs2), zetas);
 5986     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5987     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5988 
 5989     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5990     load32shorts(vs_front(vs2), zetas);
 5991     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5992     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5993 
 5994     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5995     __ mov(r0, zr); // return 0
 5996     __ ret(lr);
 5997 
 5998     // record the stub entry and end
 5999     store_archive_data(stub_id, start, __ pc());
 6000 
 6001     return start;
 6002   }
 6003 
 6004   // Kyber Inverse NTT function
 6005   // Implements
 6006   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 6007   //
 6008   // coeffs (short[256]) = c_rarg0
 6009   // ntt_zetas (short[256]) = c_rarg1
 6010   address generate_kyberInverseNtt() {
 6011     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 6012     int entry_count = StubInfo::entry_count(stub_id);
 6013     assert(entry_count == 1, "sanity check");
 6014     address start = load_archive_data(stub_id);
 6015     if (start != nullptr) {
 6016       return start;
 6017     }
 6018     __ align(CodeEntryAlignment);
 6019     StubCodeMark mark(this, stub_id);
 6020     start = __ pc();
 6021     __ enter();
 6022 
 6023     const Register coeffs = c_rarg0;
 6024     const Register zetas = c_rarg1;
 6025 
 6026     const Register kyberConsts = r10;
 6027     const Register tmpAddr = r11;
 6028     const Register tmpAddr2 = c_rarg2;
 6029 
 6030     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 6031     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6032     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6033 
 6034     __ lea(kyberConsts,
 6035              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6036 
 6037     // level 0
 6038     // At level 0 related coefficients occur in discrete blocks of size 4 so
 6039     // need to be loaded interleaved using an ld2 operation with arrangement 4S.
 6040 
 6041     vs_ldpq(vq, kyberConsts);
 6042     int offsets4[4] = { 0, 32, 64, 96 };
 6043     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6044     load32shorts(vs_front(vs2), zetas);
 6045     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6046                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6047     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6048     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6049     load32shorts(vs_front(vs2), zetas);
 6050     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6051                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6052     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6053     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6054     load32shorts(vs_front(vs2), zetas);
 6055     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6056                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6057     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6058     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6059     load32shorts(vs_front(vs2), zetas);
 6060     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6061                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6062     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6063 
 6064     // level 1
 6065     // At level 1 related coefficients occur in discrete blocks of size 8 so
 6066     // need to be loaded interleaved using an ld2 operation with arrangement 2D.
 6067 
 6068     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6069     load32shorts(vs_front(vs2), zetas);
 6070     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6071                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6072     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6073     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6074     load32shorts(vs_front(vs2), zetas);
 6075     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6076                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6077     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6078 
 6079     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6080     load32shorts(vs_front(vs2), zetas);
 6081     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6082                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6083     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6084     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6085     load32shorts(vs_front(vs2), zetas);
 6086     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6087                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6088     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6089 
 6090     // level 2
 6091     // At level 2 coefficients occur in 8 discrete blocks of size 16
 6092     // so they are loaded using employing an ldr at 8 distinct offsets.
 6093 
 6094     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6095     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6096     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6097     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6098     vs_subv(vs1, __ T8H, vs1, vs2);
 6099     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 6100     load64shorts(vs2, zetas);
 6101     vs_ldpq(vq, kyberConsts);
 6102     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6103     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6104 
 6105     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6106     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6107     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6108     vs_subv(vs1, __ T8H, vs1, vs2);
 6109     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 6110     load64shorts(vs2, zetas);
 6111     vs_ldpq(vq, kyberConsts);
 6112     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6113     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6114 
 6115     // Barrett reduction at indexes where overflow may happen
 6116 
 6117     // load q and the multiplier for the Barrett reduction
 6118     __ add(tmpAddr, kyberConsts, 16);
 6119     vs_ldpq(vq, tmpAddr);
 6120 
 6121     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 6122     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 6123     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
 6124     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6125     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6126     vs_sshr(vs2, __ T8H, vs2, 11);
 6127     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6128     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6129     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6130     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6131     vs_sshr(vs2, __ T8H, vs2, 11);
 6132     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6133     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6134 
 6135     // level 3
 6136     // From level 3 upwards coefficients occur in discrete blocks whose size is
 6137     // some multiple of 32 so can be loaded using ldpq and suitable indexes.
 6138 
 6139     int offsets2[4] = { 0, 64, 128, 192 };
 6140     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 6141     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 6142     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6143     vs_subv(vs1, __ T8H, vs1, vs2);
 6144     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 6145     load64shorts(vs2, zetas);
 6146     vs_ldpq(vq, kyberConsts);
 6147     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6148     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 6149 
 6150     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 6151     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6152     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6153     vs_subv(vs1, __ T8H, vs1, vs2);
 6154     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 6155     load64shorts(vs2, zetas);
 6156     vs_ldpq(vq, kyberConsts);
 6157     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6158     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6159 
 6160     // level 4
 6161 
 6162     int offsets1[4] = { 0, 32, 128, 160 };
 6163     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 6164     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 6165     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6166     vs_subv(vs1, __ T8H, vs1, vs2);
 6167     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 6168     load64shorts(vs2, zetas);
 6169     vs_ldpq(vq, kyberConsts);
 6170     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6171     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 6172 
 6173     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 6174     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6175     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6176     vs_subv(vs1, __ T8H, vs1, vs2);
 6177     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 6178     load64shorts(vs2, zetas);
 6179     vs_ldpq(vq, kyberConsts);
 6180     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6181     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6182 
 6183     // level 5
 6184 
 6185     __ add(tmpAddr, coeffs, 0);
 6186     load64shorts(vs1, tmpAddr);
 6187     __ add(tmpAddr, coeffs, 128);
 6188     load64shorts(vs2, tmpAddr);
 6189     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6190     vs_subv(vs1, __ T8H, vs1, vs2);
 6191     __ add(tmpAddr, coeffs, 0);
 6192     store64shorts(vs3, tmpAddr);
 6193     load64shorts(vs2, zetas);
 6194     vs_ldpq(vq, kyberConsts);
 6195     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6196     __ add(tmpAddr, coeffs, 128);
 6197     store64shorts(vs2, tmpAddr);
 6198 
 6199     load64shorts(vs1, tmpAddr);
 6200     __ add(tmpAddr, coeffs, 384);
 6201     load64shorts(vs2, tmpAddr);
 6202     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6203     vs_subv(vs1, __ T8H, vs1, vs2);
 6204     __ add(tmpAddr, coeffs, 256);
 6205     store64shorts(vs3, tmpAddr);
 6206     load64shorts(vs2, zetas);
 6207     vs_ldpq(vq, kyberConsts);
 6208     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6209     __ add(tmpAddr, coeffs, 384);
 6210     store64shorts(vs2, tmpAddr);
 6211 
 6212     // Barrett reduction at indexes where overflow may happen
 6213 
 6214     // load q and the multiplier for the Barrett reduction
 6215     __ add(tmpAddr, kyberConsts, 16);
 6216     vs_ldpq(vq, tmpAddr);
 6217 
 6218     int offsets0[2] = { 0, 256 };
 6219     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6220     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6221     vs_sshr(vs2, __ T8H, vs2, 11);
 6222     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6223     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6224 
 6225     // level 6
 6226 
 6227     __ add(tmpAddr, coeffs, 0);
 6228     load64shorts(vs1, tmpAddr);
 6229     __ add(tmpAddr, coeffs, 256);
 6230     load64shorts(vs2, tmpAddr);
 6231     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6232     vs_subv(vs1, __ T8H, vs1, vs2);
 6233     __ add(tmpAddr, coeffs, 0);
 6234     store64shorts(vs3, tmpAddr);
 6235     load64shorts(vs2, zetas);
 6236     vs_ldpq(vq, kyberConsts);
 6237     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6238     __ add(tmpAddr, coeffs, 256);
 6239     store64shorts(vs2, tmpAddr);
 6240 
 6241     __ add(tmpAddr, coeffs, 128);
 6242     load64shorts(vs1, tmpAddr);
 6243     __ add(tmpAddr, coeffs, 384);
 6244     load64shorts(vs2, tmpAddr);
 6245     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6246     vs_subv(vs1, __ T8H, vs1, vs2);
 6247     __ add(tmpAddr, coeffs, 128);
 6248     store64shorts(vs3, tmpAddr);
 6249     load64shorts(vs2, zetas);
 6250     vs_ldpq(vq, kyberConsts);
 6251     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6252     __ add(tmpAddr, coeffs, 384);
 6253     store64shorts(vs2, tmpAddr);
 6254 
 6255     // multiply by 2^-n
 6256 
 6257     // load toMont(2^-n mod q)
 6258     __ add(tmpAddr, kyberConsts, 48);
 6259     __ ldr(v29, __ Q, tmpAddr);
 6260 
 6261     vs_ldpq(vq, kyberConsts);
 6262     __ add(tmpAddr, coeffs, 0);
 6263     load64shorts(vs1, tmpAddr);
 6264     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6265     __ add(tmpAddr, coeffs, 0);
 6266     store64shorts(vs2, tmpAddr);
 6267 
 6268     // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so
 6269     load64shorts(vs1, tmpAddr);
 6270     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6271     __ add(tmpAddr, coeffs, 128);
 6272     store64shorts(vs2, tmpAddr);
 6273 
 6274     // now tmpAddr contains coeffs + 256
 6275     load64shorts(vs1, tmpAddr);
 6276     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6277     __ add(tmpAddr, coeffs, 256);
 6278     store64shorts(vs2, tmpAddr);
 6279 
 6280     // now tmpAddr contains coeffs + 384
 6281     load64shorts(vs1, tmpAddr);
 6282     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6283     __ add(tmpAddr, coeffs, 384);
 6284     store64shorts(vs2, tmpAddr);
 6285 
 6286     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6287     __ mov(r0, zr); // return 0
 6288     __ ret(lr);
 6289 
 6290     // record the stub entry and end
 6291     store_archive_data(stub_id, start, __ pc());
 6292 
 6293     return start;
 6294   }
 6295 
 6296   // Kyber multiply polynomials in the NTT domain.
 6297   // Implements
 6298   // static int implKyberNttMult(
 6299   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 6300   //
 6301   // The actual algorithm that is used here differs from the one in the Java
 6302   // implementation, it uses Montgomery multiplications instead of Barrett
 6303   // reduction, but the end result modulo MLKEM_Q is the same. This is the
 6304   // Java equivalent of this intrinsic implementation:
 6305   // static void implKyberNttMultJava(short[] result, short[] ntta, short[] nttb) {
 6306   //         for (int m = 0; m < ML_KEM_N / 2; m++) {
 6307   //             int a0 = ntta[2 * m];
 6308   //             int a1 = ntta[2 * m + 1];
 6309   //             int b0 = nttb[2 * m];
 6310   //             int b1 = nttb[2 * m + 1];
 6311   //             int r = montMul(a0, b0) +
 6312   //                     montMul(montMul(a1, b1), MONT_ZETAS_FOR_NTT_MULT[m]);
 6313   //             result[2 * m] = (short) montMul(r, MONT_R_SQUARE_MOD_Q);
 6314   //             result[2 * m + 1] = (short) montMul(
 6315   //                     (montMul(a0, b1) + montMul(a1, b0)), MONT_R_SQUARE_MOD_Q);
 6316   //          }
 6317   // }
 6318   //
 6319   // result (short[256]) = c_rarg0
 6320   // ntta (short[256]) = c_rarg1
 6321   // nttb (short[256]) = c_rarg2
 6322   // zetas (short[128]) = c_rarg3
  address generate_kyberNttMult() {
    StubId stub_id = StubId::stubgen_kyberNttMult_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // If load_archive_data already yields an entry for this stub there
    // is nothing to generate.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();
    __ enter();

    const Register result = c_rarg0;
    const Register ntta = c_rarg1;
    const Register nttb = c_rarg2;
    const Register zetas = c_rarg3;

    const Register kyberConsts = r10;
    const Register limit = r11;   // result + 512, i.e. the loop end address

    // Register plan: vs1 = v0..v3, vs2 = v4..v7, vs3 = v16..v19,
    // vs4 = v20..v23. vs2 is only ever passed as the temporary argument
    // of the montmul helpers.
    VSeq<4> vs1(0), vs2(4);  // 4 sequences of 4x8H inputs/outputs/tmps
    VSeq<4> vs3(16), vs4(20);
    VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
    VSeq<2> vz(28);          // pair of zetas
    VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ

    __ lea(kyberConsts,
             ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    Label kyberNttMult_loop;

    // The loop terminates when result has been advanced to this address.
    __ add(limit, result, 512);

    // load q and qinv
    vs_ldpq(vq, kyberConsts);

    // load R^2 mod q (to convert back from Montgomery representation)
    // n.b. this advances kyberConsts itself by 64; the register is not
    // used as the table base again in this stub.
    __ add(kyberConsts, kyberConsts, 64);
    __ ldr(v27, __ Q, kyberConsts);

    // Everything above is loop invariant: vq, v27 (vc) and limit are set
    // up once and only read inside the loop.
    __ BIND(kyberNttMult_loop);

    // load 16 zetas
    vs_ldpq_post(vz, zetas);

    // load 2 sets of 32 coefficients from the two input arrays
    // interleaved as shorts. i.e. pairs of shorts adjacent in memory
    // are striped across pairs of vector registers
    vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
    vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
    vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
    vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H

    // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
    // i.e. montmul the first and second halves of vs1 in order and
    // then with one sequence reversed storing the two results in vs3
    //
    // vs3[0] <- montmul(a0, b0)
    // vs3[1] <- montmul(a1, b1)
    // vs3[2] <- montmul(a0, b1)
    // vs3[3] <- montmul(a1, b0)
    kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
    kyber_montmul16(vs_back(vs3),
                    vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);

    // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
    // i.e. montmul the first and second halves of vs4 in order and
    // then with one sequence reversed storing the two results in vs1
    // (the inputs previously held in vs1 are no longer needed)
    //
    // vs1[0] <- montmul(a2, b2)
    // vs1[1] <- montmul(a3, b3)
    // vs1[2] <- montmul(a2, b3)
    // vs1[3] <- montmul(a3, b2)
    kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
    kyber_montmul16(vs_back(vs1),
                    vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);

    // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
    // We can schedule two montmuls at a time if we use a suitable vector
    // sequence <vs3[1], vs1[1]>. The two registers are not adjacent, so
    // the sequence is built from vs3[1] plus the (negative) distance in
    // register encodings to vs1[1].
    int delta = vs1[1]->encoding() - vs3[1]->encoding();
    VSeq<2> vs5(vs3[1], delta);

    // vs3[1] <- montmul(montmul(a1, b1), z0)
    // vs1[1] <- montmul(montmul(a3, b3), z1)
    kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);

    // add results in pairs storing in vs3
    // (even-indexed registers are added to their odd-indexed neighbours)
    // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
    // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
    vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));

    // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
    // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
    vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));

    // vs1 <- montmul(vs3, montRSquareModQ)
    kyber_montmul32(vs1, vs3, vc, vs2, vq);

    // store back the two pairs of result vectors using st2 with 8H
    // elements, i.e. each pair of shorts that is striped across a
    // register pair ends up adjacent in memory again
    vs_st2_post(vs1, __ T8H, result);

    // keep going until result has been advanced to result + 512
    __ cmp(result, limit);
    __ br(Assembler::NE, kyberNttMult_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 6440 
 6441   // Kyber add 2 polynomials.
 6442   // Implements
 6443   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 6444   //
 6445   // result (short[256]) = c_rarg0
 6446   // a (short[256]) = c_rarg1
 6447   // b (short[256]) = c_rarg2
 6448   address generate_kyberAddPoly_2() {
 6449     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 6450     int entry_count = StubInfo::entry_count(stub_id);
 6451     assert(entry_count == 1, "sanity check");
 6452     address start = load_archive_data(stub_id);
 6453     if (start != nullptr) {
 6454       return start;
 6455     }
 6456     __ align(CodeEntryAlignment);
 6457     StubCodeMark mark(this, stub_id);
 6458     start = __ pc();
 6459     __ enter();
 6460 
 6461     const Register result = c_rarg0;
 6462     const Register a = c_rarg1;
 6463     const Register b = c_rarg2;
 6464 
 6465     const Register kyberConsts = r11;
 6466 
 6467     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 6468     // So, we can load, add and store the data in 3 groups of 11,
 6469     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 6470     // registers. A further constraint is that the mapping needs
 6471     // to skip callee saves. So, we allocate the register
 6472     // sequences using two 8 sequences, two 2 sequences and two
 6473     // single registers.
 6474     VSeq<8> vs1_1(0);
 6475     VSeq<2> vs1_2(16);
 6476     FloatRegister vs1_3 = v28;
 6477     VSeq<8> vs2_1(18);
 6478     VSeq<2> vs2_2(26);
 6479     FloatRegister vs2_3 = v29;
 6480 
 6481     // two constant vector sequences
 6482     VSeq<8> vc_1(31, 0);
 6483     VSeq<2> vc_2(31, 0);
 6484 
 6485     FloatRegister vc_3 = v31;
 6486     __ lea(kyberConsts,
 6487              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6488 
 6489     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6490     for (int i = 0; i < 3; i++) {
 6491       // load 80 or 88 values from a into vs1_1/2/3
 6492       vs_ldpq_post(vs1_1, a);
 6493       vs_ldpq_post(vs1_2, a);
 6494       if (i < 2) {
 6495         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6496       }
 6497       // load 80 or 88 values from b into vs2_1/2/3
 6498       vs_ldpq_post(vs2_1, b);
 6499       vs_ldpq_post(vs2_2, b);
 6500       if (i < 2) {
 6501         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6502       }
 6503       // sum 80 or 88 values across vs1 and vs2 into vs1
 6504       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6505       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6506       if (i < 2) {
 6507         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6508       }
 6509       // add constant to all 80 or 88 results
 6510       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6511       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6512       if (i < 2) {
 6513         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6514       }
 6515       // store 80 or 88 values
 6516       vs_stpq_post(vs1_1, result);
 6517       vs_stpq_post(vs1_2, result);
 6518       if (i < 2) {
 6519         __ str(vs1_3, __ Q, __ post(result, 16));
 6520       }
 6521     }
 6522 
 6523     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6524     __ mov(r0, zr); // return 0
 6525     __ ret(lr);
 6526 
 6527     // record the stub entry and end
 6528     store_archive_data(stub_id, start, __ pc());
 6529 
 6530     return start;
 6531   }
 6532 
 6533   // Kyber add 3 polynomials.
 6534   // Implements
 6535   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 6536   //
 6537   // result (short[256]) = c_rarg0
 6538   // a (short[256]) = c_rarg1
 6539   // b (short[256]) = c_rarg2
 6540   // c (short[256]) = c_rarg3
 6541   address generate_kyberAddPoly_3() {
 6542     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 6543     int entry_count = StubInfo::entry_count(stub_id);
 6544     assert(entry_count == 1, "sanity check");
 6545     address start = load_archive_data(stub_id);
 6546     if (start != nullptr) {
 6547       return start;
 6548     }
 6549     __ align(CodeEntryAlignment);
 6550     StubCodeMark mark(this, stub_id);
 6551     start = __ pc();
 6552     __ enter();
 6553 
 6554     const Register result = c_rarg0;
 6555     const Register a = c_rarg1;
 6556     const Register b = c_rarg2;
 6557     const Register c = c_rarg3;
 6558 
 6559     const Register kyberConsts = r11;
 6560 
 6561     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6562     // quadwords.  So, we can load, add and store the data in 3
 6563     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6564     // of 10 or 11 registers. A further constraint is that the
 6565     // mapping needs to skip callee saves. So, we allocate the
 6566     // register sequences using two 8 sequences, two 2 sequences
 6567     // and two single registers.
 6568     VSeq<8> vs1_1(0);
 6569     VSeq<2> vs1_2(16);
 6570     FloatRegister vs1_3 = v28;
 6571     VSeq<8> vs2_1(18);
 6572     VSeq<2> vs2_2(26);
 6573     FloatRegister vs2_3 = v29;
 6574 
 6575     // two constant vector sequences
 6576     VSeq<8> vc_1(31, 0);
 6577     VSeq<2> vc_2(31, 0);
 6578 
 6579     FloatRegister vc_3 = v31;
 6580 
 6581     __ lea(kyberConsts,
 6582              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6583 
 6584     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6585     for (int i = 0; i < 3; i++) {
 6586       // load 80 or 88 values from a into vs1_1/2/3
 6587       vs_ldpq_post(vs1_1, a);
 6588       vs_ldpq_post(vs1_2, a);
 6589       if (i < 2) {
 6590         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6591       }
 6592       // load 80 or 88 values from b into vs2_1/2/3
 6593       vs_ldpq_post(vs2_1, b);
 6594       vs_ldpq_post(vs2_2, b);
 6595       if (i < 2) {
 6596         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6597       }
 6598       // sum 80 or 88 values across vs1 and vs2 into vs1
 6599       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6600       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6601       if (i < 2) {
 6602         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6603       }
 6604       // load 80 or 88 values from c into vs2_1/2/3
 6605       vs_ldpq_post(vs2_1, c);
 6606       vs_ldpq_post(vs2_2, c);
 6607       if (i < 2) {
 6608         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6609       }
 6610       // sum 80 or 88 values across vs1 and vs2 into vs1
 6611       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6612       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6613       if (i < 2) {
 6614         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6615       }
 6616       // add constant to all 80 or 88 results
 6617       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6618       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6619       if (i < 2) {
 6620         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6621       }
 6622       // store 80 or 88 values
 6623       vs_stpq_post(vs1_1, result);
 6624       vs_stpq_post(vs1_2, result);
 6625       if (i < 2) {
 6626         __ str(vs1_3, __ Q, __ post(result, 16));
 6627       }
 6628     }
 6629 
 6630     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6631     __ mov(r0, zr); // return 0
 6632     __ ret(lr);
 6633 
 6634     // record the stub entry and end
 6635     store_archive_data(stub_id, start, __ pc());
 6636 
 6637     return start;
 6638   }
 6639 
 6640   // Kyber parse XOF output to polynomial coefficient candidates
 6641   // or decodePoly(12, ...).
 6642   // Implements
 6643   // static int implKyber12To16(
 6644   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6645   //
 6646   // we assume that parsed and condensed are allocated such that for
 6647   // n = (parsedLength + 63) / 64
 6648   // n blocks of 96 bytes of input can be processed, i.e.
 6649   // index + n * 96 <= condensed.length and
 6650   // n * 64 <= parsed.length
 6651   //
 6652   // condensed (byte[]) = c_rarg0
 6653   // condensedIndex = c_rarg1
 6654   // parsed (short[]) = c_rarg2
 6655   // parsedLength = c_rarg3
        //
        // The generated code always returns 0 in r0.
 6656   address generate_kyber12To16() {
 6657     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6658     int entry_count = StubInfo::entry_count(stub_id);
 6659     assert(entry_count == 1, "sanity check");
          // nothing to generate if load_archive_data already supplied an entry
 6660     address start = load_archive_data(stub_id);
 6661     if (start != nullptr) {
 6662       return start;
 6663     }
          // L_F00 labels the 16 byte mask constant emitted after the code,
          // L_loop labels the top of the per-block processing loop
 6664     Label L_F00, L_loop;
 6665 
 6666     __ align(CodeEntryAlignment);
 6667     StubCodeMark mark(this, stub_id);
 6668     start = __ pc();
 6669     __ enter();
 6670 
 6671     const Register condensed = c_rarg0;
 6672     const Register condensedOffs = c_rarg1;
 6673     const Register parsed = c_rarg2;
 6674     const Register parsedLength = c_rarg3;
 6675 
 6676     const Register tmpAddr = r11;
 6677 
 6678     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6679     // quadwords so we need a 6 vector sequence for the inputs.
 6680     // Parsing produces 64 shorts, employing two 8 vector
 6681     // sequences to store and combine the intermediate data.
 6682     VSeq<6> vin(24);
 6683     VSeq<8> va(0), vb(16);
 6684 
          // load the mask constant and advance condensed by the start offset
 6685     __ adr(tmpAddr, L_F00);
 6686     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6687     __ add(condensed, condensed, condensedOffs);
 6688 
 6689     __ BIND(L_loop);
 6690     // load 96 (6 x 16B) byte values
 6691     vs_ld3_post(vin, __ T16B, condensed);
 6692 
 6693     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6694     // holds 48 (16x3) contiguous bytes from memory striped
 6695     // horizontally across each of the 16 byte lanes. Equivalently,
 6696     // that is 16 pairs of 12-bit integers. Likewise the back half
 6697     // holds the next 48 bytes in the same arrangement.
 6698 
 6699     // Each vector in the front half can also be viewed as a vertical
 6700     // strip across the 16 pairs of 12 bit integers. Each byte in
 6701     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6702     // byte in vin[1] stores the high 4 bits of the first int and the
 6703     // low 4 bits of the second int. Each byte in vin[2] stores the
 6704     // high 8 bits of the second int. Likewise the vectors in second
 6705     // half.
 6706 
 6707     // Converting the data to 16-bit shorts requires first of all
 6708     // expanding each of the 6 x 16B vectors into 6 corresponding
 6709     // pairs of 8H vectors. Mask, shift and add operations on the
 6710     // resulting vector pairs can be used to combine 4 and 8 bit
 6711     // parts of related 8H vector elements.
 6712     //
 6713     // The middle vectors (vin[1] and vin[4]) are actually expanded
 6714     // twice, one copy manipulated to provide the higher 4 bits
 6715     // belonging to the first short in a pair and another copy
 6716     // manipulated to provide the lower 4 bits belonging to the
 6717     // second short in a pair. This is why the vector sequences va
 6718     // and vb used to hold the expanded 8H elements are of length 8.
 6719 
 6720     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6721     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6722     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6723     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6724     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6725     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6726     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6727     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6728 
 6729     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6730     // and vb[4:5]
 6731     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6732     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6733     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6734     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6735     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6736     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6737 
 6738     // shift lo byte of copy 1 of the middle stripe into the high byte
 6739     __ shl(va[2], __ T8H, va[2], 8);
 6740     __ shl(va[3], __ T8H, va[3], 8);
 6741     __ shl(vb[2], __ T8H, vb[2], 8);
 6742     __ shl(vb[3], __ T8H, vb[3], 8);
 6743 
 6744     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6745     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6746     // are in bit positions [4..11].
 6747     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6748     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6749     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6750     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6751 
 6752     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6753     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6754     // copy2
          // n.b. v31 holds 0x0f00 in every 8H lane, so after the shift by 8
          // above the mask keeps only bits [8..11] of each copy1 element
 6755     __ andr(va[2], __ T16B, va[2], v31);
 6756     __ andr(va[3], __ T16B, va[3], v31);
 6757     __ ushr(va[4], __ T8H, va[4], 4);
 6758     __ ushr(va[5], __ T8H, va[5], 4);
 6759     __ andr(vb[2], __ T16B, vb[2], v31);
 6760     __ andr(vb[3], __ T16B, vb[3], v31);
 6761     __ ushr(vb[4], __ T8H, vb[4], 4);
 6762     __ ushr(vb[5], __ T8H, vb[5], 4);
 6763 
 6764     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6765     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6766     // n.b. the ordering ensures: i) inputs are consumed before they
 6767     // are overwritten ii) the order of 16-bit results across successive
 6768     // pairs of vectors in va and then vb reflects the order of the
 6769     // corresponding 12-bit inputs
          // The sums are written to va[0:3] and vb[0:3], i.e. the front
          // halves of va and vb that are stored below.
 6770     __ addv(va[0], __ T8H, va[0], va[2]);
 6771     __ addv(va[2], __ T8H, va[1], va[3]);
 6772     __ addv(va[1], __ T8H, va[4], va[6]);
 6773     __ addv(va[3], __ T8H, va[5], va[7]);
 6774     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6775     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6776     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6777     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6778 
 6779     // store 64 results interleaved as shorts
 6780     vs_st2_post(vs_front(va), __ T8H, parsed);
 6781     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6782 
          // 64 shorts were produced by this pass; loop again while the
          // remaining count is still greater than zero
 6783     __ sub(parsedLength, parsedLength, 64);
 6784     __ cmp(parsedLength, (u1)0);
 6785     __ br(Assembler::GT, L_loop);
 6786 
 6787     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6788     __ mov(r0, zr); // return 0
 6789     __ ret(lr);
 6790 
 6791     // bind label and generate constant data used by this stub
          // (two 64-bit words, i.e. 8 halfwords each equal to 0x0f00)
 6792     __ BIND(L_F00);
 6793     __ emit_int64(0x0f000f000f000f00);
 6794     __ emit_int64(0x0f000f000f000f00);
 6795 
 6796     // record the stub entry and end
 6797     store_archive_data(stub_id, start, __ pc());
 6798 
 6799     return start;
 6800   }
 6801 
 6802   // Kyber Barrett reduce function.
 6803   // Implements
 6804   // static int implKyberBarrettReduce(short[] coeffs) {}
 6805   //
 6806   // coeffs (short[256]) = c_rarg0
        //
        // The coefficients are reduced in place and the generated code
        // always returns 0 in r0.
 6807   address generate_kyberBarrettReduce() {
 6808     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6809     int entry_count = StubInfo::entry_count(stub_id);
 6810     assert(entry_count == 1, "sanity check");
          // nothing to generate if load_archive_data already supplied an entry
 6811     address start = load_archive_data(stub_id);
 6812     if (start != nullptr) {
 6813       return start;
 6814     }
 6815     __ align(CodeEntryAlignment);
 6816     StubCodeMark mark(this, stub_id);
 6817     start = __ pc();
 6818     __ enter();
 6819 
 6820     const Register coeffs = c_rarg0;
 6821 
 6822     const Register kyberConsts = r10;
 6823     const Register result = r11;
 6824 
 6825     // As above we process 256 sets of values in total i.e. 32 x
 6826     // 8H quadwords. So, we can load, reduce and store the data in 3
 6827     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6828     // of 10 or 11 registers. A further constraint is that the
 6829     // mapping needs to skip callee saves. So, we allocate the
 6830     // register sequences using two 8 sequences, two 2 sequences
 6831     // and two single registers.
 6832     VSeq<8> vs1_1(0);
 6833     VSeq<2> vs1_2(16);
 6834     FloatRegister vs1_3 = v28;
 6835     VSeq<8> vs2_1(18);
 6836     VSeq<2> vs2_2(26);
 6837     FloatRegister vs2_3 = v29;
 6838 
 6839     // we also need a pair of corresponding constant sequences
          // NOTE(review): the second constructor argument (0) is presumably
          // a register stride of zero, so that every element of the sequence
          // names the same register (v30 or v31) -- confirm against VSeq
 6840 
 6841     VSeq<8> vc1_1(30, 0);
 6842     VSeq<2> vc1_2(30, 0);
 6843     FloatRegister vc1_3 = v30; // for kyber_q
 6844 
 6845     VSeq<8> vc2_1(31, 0);
 6846     VSeq<2> vc2_2(31, 0);
 6847     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6848 
          // result keeps the original coeffs address for the stores while
          // coeffs itself is advanced by the post-indexed loads below
 6849     __ add(result, coeffs, 0);
 6850     __ lea(kyberConsts,
 6851              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6852 
 6853     // load q and the multiplier for the Barrett reduction
          // n.b. they are read as a quadword pair starting 16 bytes into
          // the constants table
 6854     __ add(kyberConsts, kyberConsts, 16);
 6855     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6856 
          // 3 passes: the first two handle 11 quadwords (88 values), the
          // last handles the remaining 10 (80 values), hence the i < 2 guards
 6857     for (int i = 0; i < 3; i++) {
 6858       // load 80 or 88 coefficients
 6859       vs_ldpq_post(vs1_1, coeffs);
 6860       vs_ldpq_post(vs1_2, coeffs);
 6861       if (i < 2) {
 6862         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6863       }
 6864 
 6865       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6866       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6867       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6868       if (i < 2) {
 6869         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6870       }
 6871 
 6872       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
            // (a further arithmetic shift by 11 on top of the doubling
            // multiply-high above)
 6873       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6874       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6875       if (i < 2) {
 6876         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6877       }
 6878 
 6879       // vs1 <- vs1 - vs2 * kyber_q
 6880       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6881       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6882       if (i < 2) {
 6883         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6884       }
 6885 
            // store the 80 or 88 reduced coefficients via result
 6886       vs_stpq_post(vs1_1, result);
 6887       vs_stpq_post(vs1_2, result);
 6888       if (i < 2) {
 6889         __ str(vs1_3, __ Q, __ post(result, 16));
 6890       }
 6891     }
 6892 
 6893     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6894     __ mov(r0, zr); // return 0
 6895     __ ret(lr);
 6896 
 6897     // record the stub entry and end
 6898     store_archive_data(stub_id, start, __ pc());
 6899 
 6900     return start;
 6901   }
 6902 
 6903 
 6904   // Dilithium-specific montmul helper routines that generate parallel
 6905   // code for, respectively, a single 4x4s vector sequence montmul or
 6906   // two such multiplies in a row.
 6907 
 6908   // Perform 16 32-bit Montgomery multiplications in parallel
        //
        // As used by the callers in this file, va receives the products of
        // the corresponding elements of vb and vc, vtmp provides scratch
        // registers and vq holds the constant pair that callers load with
        // the annotation "qInv, q".
 6909   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6910                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6911     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6912     // It will assert that the register use is valid
 6913     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6914   }
 6915 
 6916   // Perform 2x16 32-bit Montgomery multiplications in parallel
        //
        // As used by the callers in this file, va receives the products of
        // the corresponding elements of vb and vc, vtmp provides the 4
        // scratch registers shared by both half-width multiplies and vq
        // holds the constant pair that callers load with the annotation
        // "qInv, q".
 6917   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6918                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6919     // Schedule two successive 4x4S multiplies via the montmul helper
 6920     // on the front and back halves of va, vb and vc. The helper will
 6921     // assert that the register use has no overlap conflicts on each
 6922     // individual call but we also need to ensure that the necessary
 6923     // disjoint/equality constraints are met across both calls.
 6924 
 6925     // vb, vc, vtmp and vq must be disjoint. va must either be
 6926     // disjoint from all other registers or equal vc
 6927 
 6928     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6929     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6930     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6931 
 6932     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6933     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6934 
 6935     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6936 
 6937     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6938     assert(vs_disjoint(va, vb), "va and vb overlap");
 6939     assert(vs_disjoint(va, vq), "va and vq overlap");
 6940     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6941 
 6942     // We multiply the front and back halves of each sequence 4 at a
 6943     // time because
 6944     //
 6945     // 1) we are currently only able to get 4-way instruction
 6946     // parallelism at best
 6947     //
 6948     // 2) we need registers for the constants in vq and temporary
 6949     // scratch registers to hold intermediate results so vtmp can only
 6950     // be a VSeq<4> which means we only have 4 scratch slots.
 6951 
 6952     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6953     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6954   }
 6955 
 6956   // Perform combined montmul then add/sub on 4x4S vectors.
        //
        // On exit, with a = montmul(va1, vc) and a0 the incoming va0:
        //   va1 = a0 - a
        //   va0 = a0 + a
        //   vc  = a   (n.b. the multiplier input is overwritten)
        // vtmp is scratch for the multiply and vq supplies its constants.
 6957   void dilithium_montmul16_sub_add(
 6958           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6959           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6960     // compute a = montmul(a1, c)
          // n.b. the product is written over vc
 6961     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 6962     // output a1 = a0 - a
          // n.b. must precede the add below, which overwrites a0
 6963     vs_subv(va1, __ T4S, va0, vc);
 6964     //    and a0 = a0 + a
 6965     vs_addv(va0, __ T4S, va0, vc);
 6966   }
 6967 
 6968   // Perform combined add/sub then montmul on 4x4S vectors.
        //
        // On exit, with a0 and a1 the incoming va0 and va1:
        //   vtmp1 = a0 - a1
        //   va0   = a0 + a1
        //   va1   = montmul(a0 - a1, vb)
        // vtmp2 is scratch for the multiply and vq supplies its constants.
 6969   void dilithium_sub_add_montmul16(
 6970           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6971           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6972     // compute c = a0 - a1
          // n.b. must precede the add below, which overwrites a0
 6973     vs_subv(vtmp1, __ T4S, va0, va1);
 6974     // output a0 = a0 + a1
 6975     vs_addv(va0, __ T4S, va0, va1);
 6976     // output a1 = b montmul c
 6977     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6978   }
 6979 
 6980   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6981   // in the Java implementation come in sequences of at least 8, so we
 6982   // can use ldpq to collect the corresponding data into pairs of vector
 6983   // registers.
 6984   // We collect the coefficients corresponding to the 'j+l' indexes into
 6985   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6986   // then we do the (Montgomery) multiplications by the zetas in parallel
 6987   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6988   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6989   // v0-v7 and finally save the results back to the coeffs array.
        //
        // Emits straight-line code for levels 0 to 4, 4 groups per level.
        // zetas is advanced by the post-indexed loads as the inputs are
        // consumed.
 6990   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6991     const Register coeffs, const Register zetas) {
          // c1 and c2 are the start positions passed to the indexed
          // loads/stores for the first and second coefficient sets;
          // c2 is halved after every level
 6992     int c1 = 0;
 6993     int c2 = 512;
 6994     int startIncr;
 6995     // don't use callee save registers v8 - v15
 6996     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6997     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6998     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6999     int offsets[4] = { 0, 32, 64, 96 };
 7000 
 7001     for (int level = 0; level < 5; level++) {
 7002       int c1Start = c1;
 7003       int c2Start = c2;
            // levels 0 - 2 use the initial offsets; levels 3 and 4 each
            // need their own layout
 7004       if (level == 3) {
 7005         offsets[1] = 32;
 7006         offsets[2] = 128;
 7007         offsets[3] = 160;
 7008       } else if (level == 4) {
 7009         offsets[1] = 64;
 7010         offsets[2] = 128;
 7011         offsets[3] = 192;
 7012       }
 7013 
 7014       // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 7015       // time at 4 different offsets and multiply them in order by the
 7016       // next set of input values. So we employ indexed load and store
 7017       // pair instructions with arrangement 4S.
 7018       for (int i = 0; i < 4; i++) {
 7019         // reload q and qinv
              // (needed every iteration because vq overlaps vs3, which is
              // overwritten by the add below)
 7020         vs_ldpq(vq, dilithiumConsts); // qInv, q
 7021         // load 8x4S coefficients via second start pos == c2
 7022         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 7023         // load next 8x4S inputs == b
 7024         vs_ldpq_post(vs2, zetas);
 7025         // compute a == c2 * b mod MONT_Q
 7026         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7027         // load 8x4s coefficients via first start pos == c1
 7028         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7029         // compute a1 =  c1 + a
 7030         vs_addv(vs3, __ T4S, vs1, vs2);
 7031         // compute a2 =  c1 - a
 7032         vs_subv(vs1, __ T4S, vs1, vs2);
 7033         // output a1 and a2
 7034         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7035         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 7036 
              // k numbers the groups consecutively across all levels; the
              // step to the next group's start positions depends on k
 7037         int k = 4 * level + i;
 7038 
 7039         if (k > 7) {
 7040           startIncr = 256;
 7041         } else if (k == 5) {
 7042           startIncr = 384;
 7043         } else {
 7044           startIncr = 128;
 7045         }
 7046 
 7047         c1Start += startIncr;
 7048         c2Start += startIncr;
 7049       }
 7050 
 7051       c2 /= 2;
 7052     }
 7053   }
 7054 
 7055   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 7056   // Implements the method
 7057   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 7058   // of the Java class sun.security.provider
 7059   //
 7060   // coeffs (int[256]) = c_rarg0
 7061   // zetas (int[256]) = c_rarg1
        //
        // The coefficients are updated in place and the generated code
        // always returns 0 in r0.
 7062   address generate_dilithiumAlmostNtt() {
 7063     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 7064     int entry_count = StubInfo::entry_count(stub_id);
 7065     assert(entry_count == 1, "sanity check");
          // nothing to generate if load_archive_data already supplied an entry
 7066     address start = load_archive_data(stub_id);
 7067     if (start != nullptr) {
 7068       return start;
 7069     }
 7070     __ align(CodeEntryAlignment);
 7071     StubCodeMark mark(this, stub_id);
 7072     start = __ pc();
 7073     __ enter();
 7074 
 7075     const Register coeffs = c_rarg0;
 7076     const Register zetas = c_rarg1;
 7077 
 7078     const Register tmpAddr = r9;
 7079     const Register dilithiumConsts = r10;
 7080     const Register result = r11;
 7081     // don't use callee save registers v8 - v15
 7082     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7083     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7084     VSeq<2> vq(30);                    // n.b. constants overlap vs3
          // offsets is used by the ld2/st2 accesses at levels 6 and 7,
          // offsets1 and offsets2 by the ldr/str accesses at level 5
 7085     int offsets[4] = { 0, 32, 64, 96};
 7086     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7087     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
          // NOTE(review): result is not referenced again in this function
 7088     __ add(result, coeffs, 0);
 7089     __ lea(dilithiumConsts,
 7090              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7091 
 7092     // Each level represents one iteration of the outer for loop of the Java version.
 7093 
 7094     // level 0-4
 7095     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 7096 
 7097     // level 5
 7098 
 7099     // At level 5 the coefficients we need to combine with the zetas
 7100     // are grouped in memory in blocks of size 4. So, for both sets of
 7101     // coefficients we load 4 adjacent values at 8 different offsets
 7102     // using an indexed ldr with register variant Q and multiply them
 7103     // in sequence order by the next set of inputs. Likewise we store
 7104     // the results using an indexed str with register variant Q.
 7105     for (int i = 0; i < 1024; i += 256) {
 7106       // reload constants q, qinv each iteration as they get clobbered later
 7107       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7108       // load 32 (8x4S) coefficients via first offsets = c1
 7109       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7110       // load next 32 (8x4S) inputs = b
 7111       vs_ldpq_post(vs2, zetas);
 7112       // a = b montmul c1
 7113       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7114       // load 32 (8x4S) coefficients via second offsets = c2
 7115       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 7116       // add/sub with result of multiply
 7117       vs_addv(vs3, __ T4S, vs1, vs2);     // a0 = c2 + a
 7118       vs_subv(vs1, __ T4S, vs1, vs2);     // a1 = c2 - a
 7119       // write back new coefficients using same offsets
            // (the sums go to the second offsets, the differences to the first)
 7120       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 7121       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 7122     }
 7123 
 7124     // level 6
 7125     // At level 6 the coefficients we need to combine with the zetas
 7126     // are grouped in memory in pairs, the first two being montmul
 7127     // inputs and the second add/sub inputs. We can still implement
 7128     // the montmul+sub+add using 4-way parallelism but only if we
 7129     // combine the coefficients with the zetas 16 at a time. We load 8
 7130     // adjacent values at 4 different offsets using an ld2 load with
 7131     // arrangement 2D. That interleaves the lower and upper halves of
 7132     // each pair of quadwords into successive vector registers. We
 7133     // then need to montmul the 4 even elements of the coefficients
 7134     // register sequence by the zetas in order and then add/sub the 4
 7135     // odd elements of the coefficients register sequence. We use an
 7136     // equivalent st2 operation to store the results back into memory
 7137     // de-interleaved.
          // NOTE(review): the call below passes vs_odd(vs1) as the montmul
          // input and add/subs the product with vs_even(vs1), so the
          // first/second and even/odd wording above looks inverted relative
          // to the code -- confirm
 7138     for (int i = 0; i < 1024; i += 128) {
 7139       // reload constants q, qinv each iteration as they get clobbered later
 7140       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7141       // load interleaved 16 (4x2D) coefficients via offsets
 7142       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7143       // load next 16 (4x4S) inputs
 7144       vs_ldpq_post(vs_front(vs2), zetas);
 7145       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7146       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7147                                   vs_front(vs2), vtmp, vq);
 7148       // store interleaved 16 (4x2D) coefficients via offsets
 7149       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7150     }
 7151 
 7152     // level 7
 7153     // At level 7 the coefficients we need to combine with the zetas
 7154     // occur singly with montmul inputs alternating with add/sub
 7155     // inputs. Once again we can use 4-way parallelism to combine 16
 7156     // zetas at a time. However, we have to load 8 adjacent values at
 7157     // 4 different offsets using an ld2 load with arrangement 4S. That
 7158     // interleaves the odd words of each pair into one
 7159     // coefficients vector register and the even words of the pair
 7160     // into the next register. We then need to montmul the 4 even
 7161     // elements of the coefficients register sequence by the zetas in
 7162     // order and then add/sub the 4 odd elements of the coefficients
 7163     // register sequence. We use an equivalent st2 operation to store
 7164     // the results back into memory de-interleaved.
          // NOTE(review): as at level 6, the call below passes vs_odd(vs1)
          // as the montmul input and add/subs the product with
          // vs_even(vs1) -- confirm the even/odd wording above
 7165 
 7166     for (int i = 0; i < 1024; i += 128) {
 7167       // reload constants q, qinv each iteration as they get clobbered later
 7168       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7169       // load interleaved 16 (4x4S) coefficients via offsets
 7170       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7171       // load next 16 (4x4S) inputs
 7172       vs_ldpq_post(vs_front(vs2), zetas);
 7173       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7174       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7175                                   vs_front(vs2), vtmp, vq);
 7176       // store interleaved 16 (4x4S) coefficients via offsets
 7177       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7178     }
 7179     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7180     __ mov(r0, zr); // return 0
 7181     __ ret(lr);
 7182 
 7183     // record the stub entry and end
 7184     store_archive_data(stub_id, start, __ pc());
 7185 
 7186     return start;
 7187   }
 7188 
 7189   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 7190   // in the Java implementation come in sequences of at least 8, so we
 7191   // can use ldpq to collect the corresponding data into pairs of vector
 7192   // registers
 7193   // We collect the coefficients that correspond to the 'j's into vs1
 7194   // the coefficients that correspond to the 'j+l's into vs2 then
 7195   // do the additions into vs3 and the subtractions into vs1 then
 7196   // save the result of the additions, load the zetas into vs2
 7197   // do the (Montgomery) multiplications by zeta in parallel into vs2
 7198   // finally save the results back to the coeffs array
        //
        // Emits straight-line code for levels 3 to 7, 4 groups per level.
        // zetas is advanced by the post-indexed loads as the inputs are
        // consumed.
 7199   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 7200     const Register coeffs, const Register zetas) {
          // c1 and c2 are the start positions passed to the indexed
          // loads/stores for the first and second coefficient sets;
          // c2 is doubled after every level
 7201     int c1 = 0;
 7202     int c2 = 32;
 7203     int startIncr;
 7204     int offsets[4];
 7205     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7206     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7207     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7208 
 7209     offsets[0] = 0;
 7210 
 7211     for (int level = 3; level < 8; level++) {
 7212       int c1Start = c1;
 7213       int c2Start = c2;
            // levels 3 and 4 each need their own offsets layout; levels
            // 5 - 7 share one
 7214       if (level == 3) {
 7215         offsets[1] = 64;
 7216         offsets[2] = 128;
 7217         offsets[3] = 192;
 7218       } else if (level == 4) {
 7219         offsets[1] = 32;
 7220         offsets[2] = 128;
 7221         offsets[3] = 160;
 7222       } else {
 7223         offsets[1] = 32;
 7224         offsets[2] = 64;
 7225         offsets[3] = 96;
 7226       }
 7227 
 7228       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 7229       // time at 4 different offsets and multiply them in order by the
 7230       // next set of input values. So we employ indexed load and store
 7231       // pair instructions with arrangement 4S.
 7232       for (int i = 0; i < 4; i++) {
 7233         // load v1 32 (8x4S) coefficients relative to first start index
 7234         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7235         // load v2 32 (8x4S) coefficients relative to second start index
 7236         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 7237         // a0 = v1 + v2 -- n.b. clobbers vq
 7238         vs_addv(vs3, __ T4S, vs1, vs2);
 7239         // a1 = v1 - v2
 7240         vs_subv(vs1, __ T4S, vs1, vs2);
 7241         // save a0 relative to first start index
 7242         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7243         // load constants q, qinv each iteration as they get clobbered above
 7244         vs_ldpq(vq, dilithiumConsts); // qInv, q
 7245         // load b next 32 (8x4S) inputs
 7246         vs_ldpq_post(vs2, zetas);
 7247         // a = a1 montmul b
 7248         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7249         // save a relative to second start index
 7250         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 7251 
              // k numbers the groups consecutively across all levels; the
              // step to the next group's start positions depends on k
 7252         int k = 4 * level + i;
 7253 
 7254         if (k < 24) {
 7255           startIncr = 256;
 7256         } else if (k == 25) {
 7257           startIncr = 384;
 7258         } else {
 7259           startIncr = 128;
 7260         }
 7261 
 7262         c1Start += startIncr;
 7263         c2Start += startIncr;
 7264       }
 7265 
 7266       c2 *= 2;
 7267     }
 7268   }
 7269 
 7270   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 7271   // Implements the method
 7272   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 7273   // the sun.security.provider.ML_DSA class.
 7274   //
 7275   // coeffs (int[256]) = c_rarg0
 7276   // zetas (int[256]) = c_rarg1
 7277   address generate_dilithiumAlmostInverseNtt() {
 7278     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 7279     int entry_count = StubInfo::entry_count(stub_id);
 7280     assert(entry_count == 1, "sanity check");
 7281     address start = load_archive_data(stub_id);
 7282     if (start != nullptr) {
 7283       return start;
 7284     }
 7285     __ align(CodeEntryAlignment);
 7286     StubCodeMark mark(this, stub_id);
 7287     start = __ pc();
 7288     __ enter();
 7289 
 7290     const Register coeffs = c_rarg0;
 7291     const Register zetas = c_rarg1;
 7292 
 7293     const Register tmpAddr = r9;
 7294     const Register dilithiumConsts = r10;
 7295     const Register result = r11;
 7296     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7297     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 7298     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7299     int offsets[4] = { 0, 32, 64, 96 };
 7300     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7301     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7302 
 7303     __ add(result, coeffs, 0);
 7304     __ lea(dilithiumConsts,
 7305              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7306 
 7307     // Each level represents one iteration of the outer for loop of the Java version
 7308 
 7309     // level 0
 7310     // At level 0 we need to interleave adjacent quartets of
 7311     // coefficients before we multiply and add/sub by the next 16
 7312     // zetas just as we did for level 7 in the multiply code. So we
 7313     // load and store the values using an ld2/st2 with arrangement 4S.
 7314     for (int i = 0; i < 1024; i += 128) {
 7315       // load constants q, qinv
 7316       // n.b. this can be moved out of the loop as they do not get
 7317       // clobbered by first two loops
 7318       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7319       // a0/a1 load interleaved 32 (8x4S) coefficients
 7320       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7321       // b load next 32 (8x4S) inputs
 7322       vs_ldpq_post(vs_front(vs2), zetas);
 7323       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7324       // n.b. second half of vs2 provides temporary register storage
 7325       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7326                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7327       // a0/a1 store interleaved 32 (8x4S) coefficients
 7328       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7329     }
 7330 
 7331     // level 1
 7332     // At level 1 we need to interleave pairs of adjacent pairs of
 7333     // coefficients before we multiply by the next 16 zetas just as we
 7334     // did for level 6 in the multiply code. So we load and store the
 7335     // values an ld2/st2 with arrangement 2D.
 7336     for (int i = 0; i < 1024; i += 128) {
 7337       // a0/a1 load interleaved 32 (8x2D) coefficients
 7338       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7339       // b load next 16 (4x4S) inputs
 7340       vs_ldpq_post(vs_front(vs2), zetas);
 7341       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7342       // n.b. second half of vs2 provides temporary register storage
 7343       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7344                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7345       // a0/a1 store interleaved 32 (8x2D) coefficients
 7346       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7347     }
 7348 
 7349     // level 2
 7350     // At level 2 coefficients come in blocks of 4. So, we load 4
 7351     // adjacent coefficients at 8 distinct offsets for both the first
 7352     // and second coefficient sequences, using an ldr with register
 7353     // variant Q then combine them with next set of 32 zetas. Likewise
 7354     // we store the results using an str with register variant Q.
 7355     for (int i = 0; i < 1024; i += 256) {
 7356       // c0 load 32 (8x4S) coefficients via first offsets
 7357       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7358       // c1 load 32 (8x4S) coefficients via second offsets
 7359       vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2);
 7360       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 7361       vs_addv(vs3, __ T4S, vs1, vs2);
 7362       // c = c0 - c1
 7363       vs_subv(vs1, __ T4S, vs1, vs2);
 7364       // store a0 32 (8x4S) coefficients via first offsets
 7365       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 7366       // b load 32 (8x4S) next inputs
 7367       vs_ldpq_post(vs2, zetas);
 7368       // reload constants q, qinv -- they were clobbered earlier
 7369       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7370       // compute a1 = b montmul c
 7371       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7372       // store a1 32 (8x4S) coefficients via second offsets
 7373       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 7374     }
 7375 
 7376     // level 3-7
 7377     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 7378 
 7379     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7380     __ mov(r0, zr); // return 0
 7381     __ ret(lr);
 7382 
 7383     // record the stub entry and end
 7384     store_archive_data(stub_id, start, __ pc());
 7385 
 7386     return start;
 7387   }
 7388 
 7389   // Dilithium multiply polynomials in the NTT domain.
 7390   // Straightforward implementation of the method
 7391   // static int implDilithiumNttMult(
 7392   //              int[] result, int[] ntta, int[] nttb {} of
 7393   // the sun.security.provider.ML_DSA class.
 7394   //
 7395   // result (int[256]) = c_rarg0
 7396   // poly1 (int[256]) = c_rarg1
 7397   // poly2 (int[256]) = c_rarg2
 7398   address generate_dilithiumNttMult() {
 7399     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 7400     int entry_count = StubInfo::entry_count(stub_id);
 7401     assert(entry_count == 1, "sanity check");
 7402     address start = load_archive_data(stub_id);
 7403     if (start != nullptr) {
 7404       return start;
 7405     }
 7406     __ align(CodeEntryAlignment);
 7407     StubCodeMark mark(this, stub_id);
 7408     start = __ pc();
 7409     __ enter();
 7410 
 7411     Label L_loop;
 7412 
 7413     const Register result = c_rarg0;
 7414     const Register poly1 = c_rarg1;
 7415     const Register poly2 = c_rarg2;
 7416 
 7417     const Register dilithiumConsts = r10;
 7418     const Register len = r11;
 7419 
 7420     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7421     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7422     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7423     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 7424 
 7425     __ lea(dilithiumConsts,
 7426              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7427 
 7428     // load constants q, qinv
 7429     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7430     // load constant rSquare into v29
 7431     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 7432 
 7433     __ mov(len, zr);
 7434     __ add(len, len, 1024);
 7435 
 7436     __ BIND(L_loop);
 7437 
 7438     // b load 32 (8x4S) next inputs from poly1
 7439     vs_ldpq_post(vs1, poly1);
 7440     // c load 32 (8x4S) next inputs from poly2
 7441     vs_ldpq_post(vs2, poly2);
 7442     // compute a = b montmul c
 7443     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7444     // compute a = rsquare montmul a
 7445     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 7446     // save a 32 (8x4S) results
 7447     vs_stpq_post(vs2, result);
 7448 
 7449     __ sub(len, len, 128);
 7450     __ cmp(len, (u1)128);
 7451     __ br(Assembler::GE, L_loop);
 7452 
 7453     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7454     __ mov(r0, zr); // return 0
 7455     __ ret(lr);
 7456 
 7457     // record the stub entry and end
 7458     store_archive_data(stub_id, start, __ pc());
 7459 
 7460     return start;
 7461   }
 7462 
 7463   // Dilithium Motgomery multiply an array by a constant.
 7464   // A straightforward implementation of the method
 7465   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 7466   // of the sun.security.provider.MLDSA class
 7467   //
 7468   // coeffs (int[256]) = c_rarg0
 7469   // constant (int) = c_rarg1
 7470   address generate_dilithiumMontMulByConstant() {
 7471     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 7472     int entry_count = StubInfo::entry_count(stub_id);
 7473     assert(entry_count == 1, "sanity check");
 7474     address start = load_archive_data(stub_id);
 7475     if (start != nullptr) {
 7476       return start;
 7477     }
 7478     __ align(CodeEntryAlignment);
 7479     StubCodeMark mark(this, stub_id);
 7480     start = __ pc();
 7481     __ enter();
 7482 
 7483     Label L_loop;
 7484 
 7485     const Register coeffs = c_rarg0;
 7486     const Register constant = c_rarg1;
 7487 
 7488     const Register dilithiumConsts = r10;
 7489     const Register result = r11;
 7490     const Register len = r12;
 7491 
 7492     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7493     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7494     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7495     VSeq<8> vconst(29, 0);             // for montmul by constant
 7496 
 7497     // results track inputs
 7498     __ add(result, coeffs, 0);
 7499     __ lea(dilithiumConsts,
 7500              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7501 
 7502     // load constants q, qinv -- they do not get clobbered by first two loops
 7503     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7504     // copy caller supplied constant across vconst
 7505     __ dup(vconst[0], __ T4S, constant);
 7506     __ mov(len, zr);
 7507     __ add(len, len, 1024);
 7508 
 7509     __ BIND(L_loop);
 7510 
 7511     // load next 32 inputs
 7512     vs_ldpq_post(vs2, coeffs);
 7513     // mont mul by constant
 7514     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 7515     // write next 32 results
 7516     vs_stpq_post(vs2, result);
 7517 
 7518     __ sub(len, len, 128);
 7519     __ cmp(len, (u1)128);
 7520     __ br(Assembler::GE, L_loop);
 7521 
 7522     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7523     __ mov(r0, zr); // return 0
 7524     __ ret(lr);
 7525 
 7526     // record the stub entry and end
 7527     store_archive_data(stub_id, start, __ pc());
 7528 
 7529     return start;
 7530   }
 7531 
 7532   // Dilithium decompose poly.
 7533   // Implements the method
 7534   // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
 7535   // of the sun.security.provider.ML_DSA class
 7536   //
 7537   // input (int[256]) = c_rarg0
 7538   // lowPart (int[256]) = c_rarg1
 7539   // highPart (int[256]) = c_rarg2
 7540   // twoGamma2  (int) = c_rarg3
 7541   // multiplier (int) = c_rarg4
 7542   address generate_dilithiumDecomposePoly() {
 7543     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 7544     int entry_count = StubInfo::entry_count(stub_id);
 7545     assert(entry_count == 1, "sanity check");
 7546     address start = load_archive_data(stub_id);
 7547     if (start != nullptr) {
 7548       return start;
 7549     }
 7550     __ align(CodeEntryAlignment);
 7551     StubCodeMark mark(this, stub_id);
 7552     start = __ pc();
 7553     Label L_loop;
 7554 
 7555     const Register input = c_rarg0;
 7556     const Register lowPart = c_rarg1;
 7557     const Register highPart = c_rarg2;
 7558     const Register twoGamma2 = c_rarg3;
 7559     const Register multiplier = c_rarg4;
 7560 
 7561     const Register len = r9;
 7562     const Register dilithiumConsts = r10;
 7563     const Register tmp = r11;
 7564 
 7565     // 6 independent sets of 4x4s values
 7566     VSeq<4> vs1(0), vs2(4), vs3(8);
 7567     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7568 
 7569     // 7 constants for cross-multiplying
 7570     VSeq<4> one(25, 0);
 7571     VSeq<4> qminus1(26, 0);
 7572     VSeq<4> g2(27, 0);
 7573     VSeq<4> twog2(28, 0);
 7574     VSeq<4> mult(29, 0);
 7575     VSeq<4> q(30, 0);
 7576     VSeq<4> qadd(31, 0);
 7577 
 7578     __ enter();
 7579 
 7580     __ lea(dilithiumConsts,
 7581              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7582 
 7583     // save callee-saved registers
 7584     __ stpd(v8, v9, __ pre(sp, -64));
 7585     __ stpd(v10, v11, Address(sp, 16));
 7586     __ stpd(v12, v13, Address(sp, 32));
 7587     __ stpd(v14, v15, Address(sp, 48));
 7588 
 7589     // populate constant registers
 7590     __ mov(tmp, zr);
 7591     __ add(tmp, tmp, 1);
 7592     __ dup(one[0], __ T4S, tmp); // 1
 7593     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7594     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7595     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7596     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7597     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7598     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7599 
 7600     __ mov(len, zr);
 7601     __ add(len, len, 1024);
 7602 
 7603     __ BIND(L_loop);
 7604 
 7605     // load next 4x4S inputs interleaved: rplus --> vs1
 7606     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7607 
 7608     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7609     vs_addv(vtmp, __ T4S, vs1, qadd);
 7610     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7611     vs_mulv(vtmp, __ T4S, vtmp, q);
 7612     vs_subv(vs1, __ T4S, vs1, vtmp);
 7613 
 7614     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7615     vs_sshr(vtmp, __ T4S, vs1, 31);
 7616     vs_andr(vtmp, vtmp, q);
 7617     vs_addv(vs1, __ T4S, vs1, vtmp);
 7618 
 7619     // quotient --> vs2
 7620     // int quotient = (rplus * multiplier) >> 22;
 7621     vs_mulv(vtmp, __ T4S, vs1, mult);
 7622     vs_sshr(vs2, __ T4S, vtmp, 22);
 7623 
 7624     // r0 --> vs3
 7625     // int r0 = rplus - quotient * twoGamma2;
 7626     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7627     vs_subv(vs3, __ T4S, vs1, vtmp);
 7628 
 7629     // mask --> vs4
 7630     // int mask = (twoGamma2 - r0) >> 22;
 7631     vs_subv(vtmp, __ T4S, twog2, vs3);
 7632     vs_sshr(vs4, __ T4S, vtmp, 22);
 7633 
 7634     // r0 -= (mask & twoGamma2);
 7635     vs_andr(vtmp, vs4, twog2);
 7636     vs_subv(vs3, __ T4S, vs3, vtmp);
 7637 
 7638     //  quotient += (mask & 1);
 7639     vs_andr(vtmp, vs4, one);
 7640     vs_addv(vs2, __ T4S, vs2, vtmp);
 7641 
 7642     // mask = (twoGamma2 / 2 - r0) >> 31;
 7643     vs_subv(vtmp, __ T4S, g2, vs3);
 7644     vs_sshr(vs4, __ T4S, vtmp, 31);
 7645 
 7646     // r0 -= (mask & twoGamma2);
 7647     vs_andr(vtmp, vs4, twog2);
 7648     vs_subv(vs3, __ T4S, vs3, vtmp);
 7649 
 7650     // quotient += (mask & 1);
 7651     vs_andr(vtmp, vs4, one);
 7652     vs_addv(vs2, __ T4S, vs2, vtmp);
 7653 
 7654     // r1 --> vs5
 7655     // int r1 = rplus - r0 - (dilithium_q - 1);
 7656     vs_subv(vtmp, __ T4S, vs1, vs3);
 7657     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7658 
 7659     // r1 --> vs1 (overwriting rplus)
 7660     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7661     vs_negr(vtmp, __ T4S, vs5);
 7662     vs_orr(vtmp, vs5, vtmp);
 7663     vs_sshr(vs1, __ T4S, vtmp, 31);
 7664 
 7665     // r0 += ~r1;
 7666     vs_notr(vtmp, vs1);
 7667     vs_addv(vs3, __ T4S, vs3, vtmp);
 7668 
 7669     // r1 = r1 & quotient;
 7670     vs_andr(vs1, vs2, vs1);
 7671 
 7672     // store results inteleaved
 7673     // lowPart[m] = r0;
 7674     // highPart[m] = r1;
 7675     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7676     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7677 
 7678     __ sub(len, len, 64);
 7679     __ cmp(len, (u1)64);
 7680     __ br(Assembler::GE, L_loop);
 7681 
 7682     // restore callee-saved vector registers
 7683     __ ldpd(v14, v15, Address(sp, 48));
 7684     __ ldpd(v12, v13, Address(sp, 32));
 7685     __ ldpd(v10, v11, Address(sp, 16));
 7686     __ ldpd(v8, v9, __ post(sp, 64));
 7687 
 7688     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7689     __ mov(r0, zr); // return 0
 7690     __ ret(lr);
 7691 
 7692     // record the stub entry and end
 7693     store_archive_data(stub_id, start, __ pc());
 7694 
 7695     return start;
 7696   }
 7697 
  // Emits the five-lane "bit clear and xor" step used by
  // keccak_round_gpr. With indices taken modulo 5 the net effect is
  //
  //   a[i] ^= a[i + 2] & ~a[i + 1]    for i = 0..4
  //
  // where every right hand side uses the lane values on entry
  // (bic(d, n, m) computes d = n & ~m).
  //
  // a0..a4          - the five lanes, updated in place
  // tmp0/tmp1/tmp2  - scratch registers, clobbered
  //
  // The instruction order matters: each bic must read its operands
  // before the corresponding lane is overwritten by an eor. tmp0 and
  // tmp1 hold the terms for a0 and a1 until the end because a0 and a1
  // are still needed as inputs for the a3 and a4 terms.
  void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
             Register tmp0, Register tmp1, Register tmp2) {
    __ bic(tmp0, a2, a1); // for a0
    __ bic(tmp1, a3, a2); // for a1
    __ bic(tmp2, a4, a3); // for a2
    __ eor(a2, a2, tmp2); // a2 done; its old value was already consumed above
    __ bic(tmp2, a0, a4); // for a3
    __ eor(a3, a3, tmp2); // a3 done; its old value was already consumed above
    __ bic(tmp2, a1, a0); // for a4
    __ eor(a0, a0, tmp0);
    __ eor(a1, a1, tmp1);
    __ eor(a4, a4, tmp2);
  }
 7711 
  // Emits one Keccak round over the 25 state lanes a0..a24, all of
  // which are held in general purpose registers.
  //
  // can_use_fp,
  // can_use_r18 - when both are true rfp and r18_tls are used as two
  //               extra temporaries (and are clobbered); otherwise a4
  //               and a9 are pushed on the stack, borrowed as
  //               temporaries and popped again before they are next
  //               needed as lanes
  // rc          - address of the round constant for this round; it is
  //               post-incremented by 8 so the next call picks up the
  //               next constant
  // a0..a24     - state lanes, updated in place
  // tmp0..tmp2  - scratch registers, clobbered
  //
  // The round is emitted in four stages: column parities c0..c4 and
  // the derived values d0..d4 are computed and xor-ed into the lanes,
  // the lanes are rotated and permuted, each row of five lanes goes
  // through bcax5, and finally the round constant is xor-ed into a0.
  //
  // NOTE(review): rax1(d, n, m) is read here as d = n ^ rol(m, 1), in
  // line with the c/d comments below -- confirm against the
  // MacroAssembler helper.
  void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
                        Register a0, Register a1, Register a2, Register a3, Register a4,
                        Register a5, Register a6, Register a7, Register a8, Register a9,
                        Register a10, Register a11, Register a12, Register a13, Register a14,
                        Register a15, Register a16, Register a17, Register a18, Register a19,
                        Register a20, Register a21, Register a22, Register a23, Register a24,
                        Register tmp0, Register tmp1, Register tmp2) {
    // c4 and c1 are computed first, while a4 and a9 still hold lanes
    __ eor3(tmp1, a4, a9, a14);
    __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
    __ eor3(tmp2, a1, a6, a11);
    __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
    __ rax1(tmp2, tmp0, tmp1); // d0
    {

      // two more temporaries are needed for the rest of this stage
      Register tmp3, tmp4;
      if (can_use_fp && can_use_r18) {
        tmp3 = rfp;
        tmp4 = r18_tls;
      } else {
        // borrow a4 and a9; their lane values are parked on the stack
        tmp3 = a4;
        tmp4 = a9;
        __ stp(tmp3, tmp4, __ pre(sp, -16));
      }

      __ eor3(tmp3, a0, a5, a10);
      __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
      __ eor(a0, a0, tmp2);
      __ eor(a5, a5, tmp2);
      __ eor(a10, a10, tmp2);
      __ eor(a15, a15, tmp2);
      __ eor(a20, a20, tmp2); // d0(tmp2)
      __ eor3(tmp3, a2, a7, a12);
      __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
      __ rax1(tmp3, tmp4, tmp2); // d1
      __ eor(a1, a1, tmp3);
      __ eor(a6, a6, tmp3);
      __ eor(a11, a11, tmp3);
      __ eor(a16, a16, tmp3);
      __ eor(a21, a21, tmp3); // d1(tmp3)
      __ rax1(tmp3, tmp2, tmp0); // d3
      __ eor3(tmp2, a3, a8, a13);
      __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
      __ eor(a3, a3, tmp3);
      __ eor(a8, a8, tmp3);
      __ eor(a13, a13, tmp3);
      __ eor(a18, a18, tmp3);
      __ eor(a23, a23, tmp3); // d3(tmp3)
      __ rax1(tmp2, tmp1, tmp0); // d2
      __ eor(a2, a2, tmp2);
      __ eor(a7, a7, tmp2);
      __ eor(a12, a12, tmp2);
      __ rax1(tmp0, tmp0, tmp4); // d4
      // tmp3/tmp4 are dead from here on so a4 and a9 can be given
      // back their lane values before d4 is applied to them
      if (!can_use_fp || !can_use_r18) {
        __ ldp(tmp3, tmp4, __ post(sp, 16));
      }
      __ eor(a17, a17, tmp2);
      __ eor(a22, a22, tmp2); // d2(tmp2)
      __ eor(a4, a4, tmp0);
      __ eor(a9, a9, tmp0);
      __ eor(a14, a14, tmp0);
      __ eor(a19, a19, tmp0);
      __ eor(a24, a24, tmp0); // d4(tmp0)
    }

    // rotate and permute the lanes. Each rol overwrites the lane that
    // was read by the previous one, so only the very first source
    // (a10) has to be parked in tmp0; it lands in a7 at the end.
    __ rol(tmp0, a10, 3);
    __ rol(a10, a1, 1);
    __ rol(a1, a6, 44);
    __ rol(a6, a9, 20);
    __ rol(a9, a22, 61);
    __ rol(a22, a14, 39);
    __ rol(a14, a20, 18);
    __ rol(a20, a2, 62);
    __ rol(a2, a12, 43);
    __ rol(a12, a13, 25);
    __ rol(a13, a19, 8) ;
    __ rol(a19, a23, 56);
    __ rol(a23, a15, 41);
    __ rol(a15, a4, 27);
    __ rol(a4, a24, 14);
    __ rol(a24, a21, 2);
    __ rol(a21, a8, 55);
    __ rol(a8, a16, 45);
    __ rol(a16, a5, 36);
    __ rol(a5, a3, 28);
    __ rol(a3, a18, 21);
    __ rol(a18, a17, 15);
    __ rol(a17, a11, 10);
    __ rol(a11, a7, 6);
    __ mov(a7, tmp0);

    // bit clear and xor within each row of five lanes
    bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
    bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
    bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
    bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
    bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);

    // xor this round's constant into a0 and step rc to the next one
    __ ldr(tmp1, __ post(rc, 8));
    __ eor(a0, a0, tmp1);

  }
 7812 
  // Generates the SHA3 compress stub variant that keeps the whole
  // Keccak state in general purpose registers.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - byte[]  SHA.state
  //   c_rarg2   - int     block_size
  //   c_rarg3   - int     offset
  //   c_rarg4   - int     limit
  //
  // offset and limit are only read by the multi block variant
  // (stubgen_sha3_implCompressMB_id), which also returns the updated
  // offset in c_rarg0.
  //
  // Stack frame below the one set up by enter() (128 bytes):
  //   sp + 0    block_size
  //   sp + 8    offset (32 bit), sp + 12 limit (32 bit) - multi block only
  //   sp + 16   buf, saved while the rounds are running
  //   sp + 32   r19..r28
  //   sp + 112  state, or r18_tls followed by state at sp + 120 when
  //             rfp and r18_tls are both used as temporaries
  //
  address generate_sha3_implCompress_gpr(StubId stub_id) {
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha3_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha3_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register buf           = c_rarg0;
    Register state         = c_rarg1;
    Register block_size    = c_rarg2;
    Register ofs           = c_rarg3;
    Register limit         = c_rarg4;

    // use r3..r17, r19..r28 to keep a0..a24.
    // a0..a24 are respective locals from SHA3.java
    Register a0 = r25,
             a1 = r26,
             a2 = r27,
             a3 = r3,
             a4 = r4,
             a5 = r5,
             a6 = r6,
             a7 = r7,
             a8 = rscratch1, // r8
             a9 = rscratch2, // r9
             a10 = r10,
             a11 = r11,
             a12 = r12,
             a13 = r13,
             a14 = r14,
             a15 = r15,
             a16 = r16,
             a17 = r17,
             a18 = r28,
             a19 = r19,
             a20 = r20,
             a21 = r21,
             a22 = r22,
             a23 = r23,
             a24 = r24;

    // the temporaries alias the argument registers and lr, which is why
    // block_size, buf and state are all kept on the stack as well
    Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;

    Label sha3_loop, rounds24_preloop, loop_body;
    Label sha3_512_or_sha3_384, shake128;

    bool can_use_r18 = false;
#ifndef R18_RESERVED
    can_use_r18 = true;
#endif
    bool can_use_fp = !PreserveFramePointer;

    __ enter();

    // save almost all yet unsaved gpr registers on stack
    __ str(block_size, __ pre(sp, -128));
    if (multi_block) {
      __ stpw(ofs, limit, Address(sp, 8));
    }
    // 8 bytes at sp+16 will be used to keep buf
    __ stp(r19, r20, Address(sp, 32));
    __ stp(r21, r22, Address(sp, 48));
    __ stp(r23, r24, Address(sp, 64));
    __ stp(r25, r26, Address(sp, 80));
    __ stp(r27, r28, Address(sp, 96));
    if (can_use_r18 && can_use_fp) {
      // r18_tls is clobbered by keccak_round_gpr in this configuration
      __ stp(r18_tls, state, Address(sp, 112));
    } else {
      __ str(state, Address(sp, 112));
    }

    // begin sha3 calculations: loading a0..a24 from state array
    __ ldp(a0, a1, state);
    __ ldp(a2, a3, Address(state, 16));
    __ ldp(a4, a5, Address(state, 32));
    __ ldp(a6, a7, Address(state, 48));
    __ ldp(a8, a9, Address(state, 64));
    __ ldp(a10, a11, Address(state, 80));
    __ ldp(a12, a13, Address(state, 96));
    __ ldp(a14, a15, Address(state, 112));
    __ ldp(a16, a17, Address(state, 128));
    __ ldp(a18, a19, Address(state, 144));
    __ ldp(a20, a21, Address(state, 160));
    __ ldp(a22, a23, Address(state, 176));
    __ ldr(a24, Address(state, 192));

    __ BIND(sha3_loop);

    // load input
    // the first 56 bytes (a0..a6) are absorbed for every block size
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a0, a0, tmp3);
    __ eor(a1, a1, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a2, a2, tmp3);
    __ eor(a3, a3, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a4, a4, tmp3);
    __ eor(a5, a5, tmp2);
    __ ldr(tmp3, __ post(buf, 8));
    __ eor(a6, a6, tmp3);

    // block_size == 72, SHA3-512; block_size == 104, SHA3-384
    // (bit 7 is clear only for these two sizes)
    __ tbz(block_size, 7, sha3_512_or_sha3_384);

    // bytes 56..135 (a7..a16) are absorbed for all remaining sizes
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a7, a7, tmp3);
    __ eor(a8, a8, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a9, a9, tmp3);
    __ eor(a10, a10, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a11, a11, tmp3);
    __ eor(a12, a12, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a13, a13, tmp3);
    __ eor(a14, a14, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a15, a15, tmp3);
    __ eor(a16, a16, tmp2);

    // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
    __ andw(tmp2, block_size, 48);
    __ cbzw(tmp2, rounds24_preloop);
    __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
    __ ldr(tmp3, __ post(buf, 8));
    __ eor(a17, a17, tmp3);
    __ b(rounds24_preloop);

    __ BIND(shake128);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a17, a17, tmp3);
    __ eor(a18, a18, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a19, a19, tmp3);
    __ eor(a20, a20, tmp2);
    __ b(rounds24_preloop); // block_size == 168, SHAKE128

    __ BIND(sha3_512_or_sha3_384);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a7, a7, tmp3);
    __ eor(a8, a8, tmp2);
    __ tbz(block_size, 5, rounds24_preloop); // SHA3-512

    // SHA3-384
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a9, a9, tmp3);
    __ eor(a10, a10, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a11, a11, tmp3);
    __ eor(a12, a12, tmp2);

    __ BIND(rounds24_preloop);
    // the round counter lives in floating point registers v0/v1
    __ fmovs(v0, 24.0); // float loop counter,
    __ fmovs(v1, 1.0);  // exact representation

    // buf (tmp1) is clobbered by the rounds so park the advanced
    // pointer on the stack; tmp3 then becomes the round constant cursor
    __ str(buf, Address(sp, 16));
    __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));

    __ BIND(loop_body);
    keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
                     a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
                     a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
                     tmp0, tmp1, tmp2);
    // count down from 24 to 0
    __ fsubs(v0, v0, v1);
    __ fcmps(v0, 0.0);
    __ br(__ NE, loop_body);

    if (multi_block) {
      __ ldrw(block_size, sp); // block_size
      __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
      __ addw(tmp2, tmp2, block_size);
      __ cmpw(tmp2, tmp1);
      __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
      __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
      __ br(Assembler::LE, sha3_loop);
      __ movw(c_rarg0, tmp2); // return offset
    }
    // recover the state array address (and r18_tls if it was borrowed)
    if (can_use_fp && can_use_r18) {
      __ ldp(r18_tls, state, Address(sp, 112));
    } else {
      __ ldr(state, Address(sp, 112));
    }
    // save calculated sha3 state
    __ stp(a0, a1, Address(state));
    __ stp(a2, a3, Address(state, 16));
    __ stp(a4, a5, Address(state, 32));
    __ stp(a6, a7, Address(state, 48));
    __ stp(a8, a9, Address(state, 64));
    __ stp(a10, a11, Address(state, 80));
    __ stp(a12, a13, Address(state, 96));
    __ stp(a14, a15, Address(state, 112));
    __ stp(a16, a17, Address(state, 128));
    __ stp(a18, a19, Address(state, 144));
    __ stp(a20, a21, Address(state, 160));
    __ stp(a22, a23, Address(state, 176));
    __ str(a24, Address(state, 192));

    // restore required registers from stack
    __ ldp(r19, r20, Address(sp, 32));
    __ ldp(r21, r22, Address(sp, 48));
    __ ldp(r23, r24, Address(sp, 64));
    __ ldp(r25, r26, Address(sp, 80));
    __ ldp(r27, r28, Address(sp, 96));
    if (can_use_fp && can_use_r18) {
      __ add(rfp, sp, 128); // leave() will copy rfp to sp below
    } // else no need to recalculate rfp, since it wasn't changed

    __ leave();

    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 8054 
 8055   /**
 8056    *  Arguments:
 8057    *
 8058    * Inputs:
 8059    *   c_rarg0   - int crc
 8060    *   c_rarg1   - byte* buf
 8061    *   c_rarg2   - int length
 8062    *
 8063    * Output:
 8064    *       rax   - int crc result
 8065    */
 8066   address generate_updateBytesCRC32() {
 8067     assert(UseCRC32Intrinsics, "what are we doing here?");
 8068     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 8069     int entry_count = StubInfo::entry_count(stub_id);
 8070     assert(entry_count == 1, "sanity check");
 8071     address start = load_archive_data(stub_id);
 8072     if (start != nullptr) {
 8073       return start;
 8074     }
 8075     __ align(CodeEntryAlignment);
 8076     StubCodeMark mark(this, stub_id);
 8077 
 8078     start = __ pc();
 8079 
 8080     const Register crc   = c_rarg0;  // crc
 8081     const Register buf   = c_rarg1;  // source java byte array address
 8082     const Register len   = c_rarg2;  // length
 8083     const Register table0 = c_rarg3; // crc_table address
 8084     const Register table1 = c_rarg4;
 8085     const Register table2 = c_rarg5;
 8086     const Register table3 = c_rarg6;
 8087     const Register tmp3 = c_rarg7;
 8088 
 8089     BLOCK_COMMENT("Entry:");
 8090     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8091 
 8092     __ kernel_crc32(crc, buf, len,
 8093               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 8094 
 8095     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8096     __ ret(lr);
 8097 
 8098     // record the stub entry and end
 8099     store_archive_data(stub_id, start, __ pc());
 8100 
 8101     return start;
 8102   }
 8103 
 8104   /**
 8105    *  Arguments:
 8106    *
 8107    * Inputs:
 8108    *   c_rarg0   - int crc
 8109    *   c_rarg1   - byte* buf
 8110    *   c_rarg2   - int length
 8111    *   c_rarg3   - int* table
 8112    *
 8113    * Output:
 8114    *       r0   - int crc result
 8115    */
 8116   address generate_updateBytesCRC32C() {
 8117     assert(UseCRC32CIntrinsics, "what are we doing here?");
 8118     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 8119     int entry_count = StubInfo::entry_count(stub_id);
 8120     assert(entry_count == 1, "sanity check");
 8121     address start = load_archive_data(stub_id);
 8122     if (start != nullptr) {
 8123       return start;
 8124     }
 8125     __ align(CodeEntryAlignment);
 8126     StubCodeMark mark(this, stub_id);
 8127 
 8128     start = __ pc();
 8129 
 8130     const Register crc   = c_rarg0;  // crc
 8131     const Register buf   = c_rarg1;  // source java byte array address
 8132     const Register len   = c_rarg2;  // length
 8133     const Register table0 = c_rarg3; // crc_table address
 8134     const Register table1 = c_rarg4;
 8135     const Register table2 = c_rarg5;
 8136     const Register table3 = c_rarg6;
 8137     const Register tmp3 = c_rarg7;
 8138 
 8139     BLOCK_COMMENT("Entry:");
 8140     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8141 
 8142     __ kernel_crc32c(crc, buf, len,
 8143               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 8144 
 8145     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8146     __ ret(lr);
 8147 
 8148     // record the stub entry and end
 8149     store_archive_data(stub_id, start, __ pc());
 8150 
 8151     return start;
 8152   }
 8153 
 8154   /***
         *  Emits the stub that folds a byte buffer into a running Adler-32
         *  checksum. When load_archive_data() already provides code for this
         *  stub, that entry is returned and nothing is generated.
         *
 8155    *  Arguments:
 8156    *
 8157    *  Inputs:
 8158    *   c_rarg0   - int   adler
 8159    *   c_rarg1   - byte* buff
 8160    *   c_rarg2   - int   len
 8161    *
 8162    * Output:
 8163    *   c_rarg0   - int adler result
         *
         *  The generated code additionally uses c_rarg3, r4, r5, r6, rscratch1,
         *  rscratch2 and v0-v3 as temporaries.
 8164    */
 8165   address generate_updateBytesAdler32() {
 8166     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 8167     int entry_count = StubInfo::entry_count(stub_id);
 8168     assert(entry_count == 1, "sanity check");
 8169     address start = load_archive_data(stub_id);
 8170     if (start != nullptr) {
            // code for this stub was found by load_archive_data: reuse it
 8171       return start;
 8172     }
 8173     __ align(CodeEntryAlignment);
 8174     StubCodeMark mark(this, stub_id);
 8175     start = __ pc();
 8176 
 8177     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 8178 
 8179     // Aliases
          // Note: adler and s1 share c_rarg0, so s1 is updated in place and
          // the final result is produced in the input register.
 8180     Register adler  = c_rarg0;
 8181     Register s1     = c_rarg0;
 8182     Register s2     = c_rarg3;
 8183     Register buff   = c_rarg1;
 8184     Register len    = c_rarg2;
 8185     Register nmax  = r4;
 8186     Register base  = r5;
 8187     Register count = r6;
 8188     Register temp0 = rscratch1;
 8189     Register temp1 = rscratch2;
 8190     FloatRegister vbytes = v0;
 8191     FloatRegister vs1acc = v1;
 8192     FloatRegister vs2acc = v2;
 8193     FloatRegister vtable = v3;
 8194 
 8195     // Max number of bytes we can process before having to take the mod
 8196     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
          // BASE (0xfff1) is 65521 in decimal; note 2^16 - BASE == 15, which the
          // reductions below rely on.
 8197     uint64_t BASE = 0xfff1;
 8198     uint64_t NMAX = 0x15B0;
 8199 
 8200     __ mov(base, BASE);
 8201     __ mov(nmax, NMAX);
 8202 
 8203     // Load accumulation coefficients for the upper 16 bits
          // NOTE(review): _adler_table is defined elsewhere; presumably it holds
          // the byte weights (16, 15, ..., 1) that
          // generate_updateBytesAdler32_accum multiplies with the loaded bytes
          // -- confirm against the table definition.
 8204     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 8205     __ ld1(vtable, __ T16B, Address(temp0));
 8206 
 8207     // s1 is initialized to the lower 16 bits of adler
 8208     // s2 is initialized to the upper 16 bits of adler
          // s2 must be extracted first because s1 aliases adler.
 8209     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 8210     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 8211 
 8212     // The pipelined loop needs at least 16 elements for 1 iteration
 8213     // It does check this, but it is more effective to skip to the cleanup loop
 8214     __ cmp(len, (u1)16);
 8215     __ br(Assembler::HS, L_nmax);   // len >= 16: use the 16-byte steps
 8216     __ cbz(len, L_combine);         // len == 0: just recombine s1 and s2
 8217 
          // 1 <= len <= 15: plain byte-at-a-time update
 8218     __ bind(L_simple_by1_loop);
 8219     __ ldrb(temp0, Address(__ post(buff, 1)));
 8220     __ add(s1, s1, temp0);
 8221     __ add(s2, s2, s1);
 8222     __ subs(len, len, 1);
 8223     __ br(Assembler::HI, L_simple_by1_loop);   // loop while len is still > 0
 8224 
 8225     // s1 = s1 % BASE
          // At most 15 bytes were added to a 16-bit value, so s1 < 2 * BASE and
          // one conditional subtract is enough.
 8226     __ subs(temp0, s1, base);
 8227     __ csel(s1, temp0, s1, Assembler::HS);
 8228 
 8229     // s2 = s2 % BASE
          // One folding step: s2 = 15 * (s2 >> 16) + (s2 & 0xffff), valid
          // because 2^16 == 15 (mod BASE); then a conditional subtract.
 8230     __ lsr(temp0, s2, 16);
 8231     __ lsl(temp1, temp0, 4);
 8232     __ sub(temp1, temp1, temp0);
 8233     __ add(s2, temp1, s2, ext::uxth);
 8234 
 8235     __ subs(temp0, s2, base);
 8236     __ csel(s2, temp0, s2, Assembler::HS);
 8237 
 8238     __ b(L_combine);
 8239 
 8240     __ bind(L_nmax);
          // len -= NMAX. A borrow (LO) means fewer than NMAX bytes are left, so
          // the full-NMAX passes are skipped. count is preset to NMAX - 16
          // either way because L_by16 uses it to undo the subtraction.
 8241     __ subs(len, len, nmax);
 8242     __ sub(count, nmax, 16);
 8243     __ br(Assembler::LO, L_by16);
 8244 
          // One pass of this loop consumes exactly NMAX bytes in 16-byte steps:
          // count runs NMAX - 16, ..., 0 and the loop exits on the borrow.
 8245     __ bind(L_nmax_loop);
 8246 
 8247     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 8248                                       vbytes, vs1acc, vs2acc, vtable);
 8249 
 8250     __ subs(count, count, 16);
 8251     __ br(Assembler::HS, L_nmax_loop);
 8252 
 8253     // s1 = s1 % BASE
          // Two folding steps x -> 15 * (x >> 16) + (x & 0xffff), followed by
          // one conditional subtract of BASE.
 8254     __ lsr(temp0, s1, 16);
 8255     __ lsl(temp1, temp0, 4);
 8256     __ sub(temp1, temp1, temp0);
 8257     __ add(temp1, temp1, s1, ext::uxth);
 8258 
 8259     __ lsr(temp0, temp1, 16);
 8260     __ lsl(s1, temp0, 4);
 8261     __ sub(s1, s1, temp0);
 8262     __ add(s1, s1, temp1, ext:: uxth);
 8263 
 8264     __ subs(temp0, s1, base);
 8265     __ csel(s1, temp0, s1, Assembler::HS);
 8266 
 8267     // s2 = s2 % BASE
          // Same two-step fold and conditional subtract as for s1.
 8268     __ lsr(temp0, s2, 16);
 8269     __ lsl(temp1, temp0, 4);
 8270     __ sub(temp1, temp1, temp0);
 8271     __ add(temp1, temp1, s2, ext::uxth);
 8272 
 8273     __ lsr(temp0, temp1, 16);
 8274     __ lsl(s2, temp0, 4);
 8275     __ sub(s2, s2, temp0);
 8276     __ add(s2, s2, temp1, ext:: uxth);
 8277 
 8278     __ subs(temp0, s2, base);
 8279     __ csel(s2, temp0, s2, Assembler::HS);
 8280 
          // Try to take another full NMAX pass; count is reloaded because the
          // inner loop left it wrapped below zero.
 8281     __ subs(len, len, nmax);
 8282     __ sub(count, nmax, 16);
 8283     __ br(Assembler::HS, L_nmax_loop);
 8284 
 8285     __ bind(L_by16);
          // len holds (remaining - NMAX) after a borrow. Adding count
          // (NMAX - 16) turns it into (remaining - 16); no carry (LO) means
          // fewer than 16 bytes remain.
 8286     __ adds(len, len, count);
 8287     __ br(Assembler::LO, L_by1);
 8288 
 8289     __ bind(L_by16_loop);
 8290 
 8291     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 8292                                       vbytes, vs1acc, vs2acc, vtable);
 8293 
 8294     __ subs(len, len, 16);
 8295     __ br(Assembler::HS, L_by16_loop);
 8296 
 8297     __ bind(L_by1);
          // len holds (remaining - 16) after a borrow. Adding 15 gives
          // (remaining - 1); no carry (LO) means nothing is left.
 8298     __ adds(len, len, 15);
 8299     __ br(Assembler::LO, L_do_mod);
 8300 
          // Trailing bytes, one at a time; runs until the decrement borrows.
 8301     __ bind(L_by1_loop);
 8302     __ ldrb(temp0, Address(__ post(buff, 1)));
 8303     __ add(s1, temp0, s1);
 8304     __ add(s2, s2, s1);
 8305     __ subs(len, len, 1);
 8306     __ br(Assembler::HS, L_by1_loop);
 8307 
 8308     __ bind(L_do_mod);
 8309     // s1 = s1 % BASE
          // Final reduction: two folds via 2^16 == 15 (mod BASE), then one
          // conditional subtract.
 8310     __ lsr(temp0, s1, 16);
 8311     __ lsl(temp1, temp0, 4);
 8312     __ sub(temp1, temp1, temp0);
 8313     __ add(temp1, temp1, s1, ext::uxth);
 8314 
 8315     __ lsr(temp0, temp1, 16);
 8316     __ lsl(s1, temp0, 4);
 8317     __ sub(s1, s1, temp0);
 8318     __ add(s1, s1, temp1, ext:: uxth);
 8319 
 8320     __ subs(temp0, s1, base);
 8321     __ csel(s1, temp0, s1, Assembler::HS);
 8322 
 8323     // s2 = s2 % BASE
 8324     __ lsr(temp0, s2, 16);
 8325     __ lsl(temp1, temp0, 4);
 8326     __ sub(temp1, temp1, temp0);
 8327     __ add(temp1, temp1, s2, ext::uxth);
 8328 
 8329     __ lsr(temp0, temp1, 16);
 8330     __ lsl(s2, temp0, 4);
 8331     __ sub(s2, s2, temp0);
 8332     __ add(s2, s2, temp1, ext:: uxth);
 8333 
 8334     __ subs(temp0, s2, base);
 8335     __ csel(s2, temp0, s2, Assembler::HS);
 8336 
 8337     // Combine lower bits and higher bits
 8338     __ bind(L_combine);
 8339     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 8340 
          // No enter()/leave() pair is emitted for this stub; it returns
          // directly through lr.
 8341     __ ret(lr);
 8342 
 8343     // record the stub entry and end
 8344     store_archive_data(stub_id, start, __ pc());
 8345 
 8346     return start;
 8347   }
 8348 
 8349   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 8350           Register temp0, Register temp1, FloatRegister vbytes,
 8351           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
          // Emits one 16-byte accumulation step of the Adler-32 stub.
          //   s1, s2  - running sums, updated in place (no modulo is applied here;
          //             the caller reduces them)
          //   buff    - source pointer, post-incremented by 16
          //   temp0/1 - scratch general registers, clobbered
          //   vbytes, vs1acc, vs2acc - scratch vector registers, clobbered
          //   vtable  - per-byte weights, only read
          //
 8352     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 8353     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 8354     // In non-vectorized code, we update s1 and s2 as:
 8355     //   s1 <- s1 + b1
 8356     //   s2 <- s2 + s1
 8357     //   s1 <- s1 + b2
 8358     //   s2 <- s2 + s1
 8359     //   ...
 8360     //   s1 <- s1 + b16
 8361     //   s2 <- s2 + s1
 8362     // Putting above assignments together, we have:
 8363     //   s1_new = s1 + b1 + b2 + ... + b16
 8364     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 8365     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 8366     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 8367     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 8368 
 8369     // s2 = s2 + s1 * 16
          // This must use the old s1, so it is emitted before s1 is updated below.
 8370     __ add(s2, s2, s1, Assembler::LSL, 4);
 8371 
 8372     // vs1acc = b1 + b2 + b3 + ... + b16
 8373     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
          // NOTE(review): the T8B multiply presumably covers the low 8 bytes and
          // the T16B multiply-accumulate the high 8 bytes, both widening to
          // 16-bit lanes that the T8H uaddlv then sums -- confirm against the
          // assembler's umullv/umlalv encodings.
 8374     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 8375     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 8376     __ uaddlv(vs1acc, __ T16B, vbytes);
 8377     __ uaddlv(vs2acc, __ T8H, vs2acc);
 8378 
 8379     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
          // The two horizontal sums are moved to general registers first.
 8380     __ fmovd(temp0, vs1acc);
 8381     __ fmovd(temp1, vs2acc);
 8382     __ add(s1, s1, temp0);
 8383     __ add(s2, s2, temp1);
 8384   }
 8385 
 8386   /**
         *  Emits the multiplyToLen stub: a frame (enter/leave) around the code
         *  produced by the assembler's multiply_to_len(). When
         *  load_archive_data() already provides code for this stub, that entry
         *  is returned and nothing is generated.
         *
 8387    *  Arguments:
 8388    *
 8389    *  Input:
 8390    *    c_rarg0   - x address
 8391    *    c_rarg1   - x length
 8392    *    c_rarg2   - y address
 8393    *    c_rarg3   - y length
 8394    *    c_rarg4   - z address
         *
         *  r5 and r10-r16 are handed to multiply_to_len() as temporaries.
 8395    */
 8396   address generate_multiplyToLen() {
 8397     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 8398     int entry_count = StubInfo::entry_count(stub_id);
 8399     assert(entry_count == 1, "sanity check");
 8400     address start = load_archive_data(stub_id);
 8401     if (start != nullptr) {
            // code for this stub was found by load_archive_data: reuse it
 8402       return start;
 8403     }
 8404     __ align(CodeEntryAlignment);
 8405     StubCodeMark mark(this, stub_id);
 8406 
 8407     start = __ pc();
          // Argument registers, in the order documented above
 8408     const Register x     = r0;
 8409     const Register xlen  = r1;
 8410     const Register y     = r2;
 8411     const Register ylen  = r3;
 8412     const Register z     = r4;
 8413 
          // Scratch registers passed through to multiply_to_len()
 8414     const Register tmp0  = r5;
 8415     const Register tmp1  = r10;
 8416     const Register tmp2  = r11;
 8417     const Register tmp3  = r12;
 8418     const Register tmp4  = r13;
 8419     const Register tmp5  = r14;
 8420     const Register tmp6  = r15;
 8421     const Register tmp7  = r16;
 8422 
 8423     BLOCK_COMMENT("Entry:");
 8424     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8425     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 8426     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8427     __ ret(lr);
 8428 
 8429     // record the stub entry and end
 8430     store_archive_data(stub_id, start, __ pc());
 8431 
 8432     return start;
 8433   }
 8434 
 8435   address generate_squareToLen() {
          // Emits the squareToLen stub. Squaring is implemented by calling
          // multiply_to_len() with both operands set to x (r0) / xlen (r1);
          // the result address z is taken from r2. Returns the stub entry, or
          // the entry supplied by load_archive_data() when one exists.
          //
 8436     // squareToLen algorithm for sizes 1..127 described in java code works
 8437     // faster than multiply_to_len on some CPUs and slower on others, but
 8438     // multiply_to_len shows a bit better overall results
 8439     StubId stub_id = StubId::stubgen_squareToLen_id;
 8440     int entry_count = StubInfo::entry_count(stub_id);
 8441     assert(entry_count == 1, "sanity check");
 8442     address start = load_archive_data(stub_id);
 8443     if (start != nullptr) {
            // code for this stub was found by load_archive_data: reuse it
 8444       return start;
 8445     }
 8446     __ align(CodeEntryAlignment);
 8447     StubCodeMark mark(this, stub_id);
 8448     start = __ pc();
 8449 
 8450     const Register x     = r0;
 8451     const Register xlen  = r1;
 8452     const Register z     = r2;
 8453     const Register y     = r4; // == x
 8454     const Register ylen  = r5; // == xlen
 8455 
          // Scratch registers passed through to multiply_to_len()
 8456     const Register tmp0  = r3;
 8457     const Register tmp1  = r10;
 8458     const Register tmp2  = r11;
 8459     const Register tmp3  = r12;
 8460     const Register tmp4  = r13;
 8461     const Register tmp5  = r14;
 8462     const Register tmp6  = r15;
 8463     const Register tmp7  = r16;
 8464 
          // y and ylen (r4, r5) are overwritten with copies of x and xlen, so
          // their incoming values are saved here and restored before returning.
 8465     RegSet spilled_regs = RegSet::of(y, ylen);
 8466     BLOCK_COMMENT("Entry:");
 8467     __ enter();
 8468     __ push(spilled_regs, sp);
 8469     __ mov(y, x);
 8470     __ mov(ylen, xlen);
 8471     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 8472     __ pop(spilled_regs, sp);
 8473     __ leave();
 8474     __ ret(lr);
 8475 
 8476     // record the stub entry and end
 8477     store_archive_data(stub_id, start, __ pc());
 8478 
 8479     return start;
 8480   }
 8481 
 8482   address generate_mulAdd() {
          // Emits the mulAdd stub: a frame (enter/leave) around the code
          // produced by the assembler's mul_add(). Returns the stub entry, or
          // the entry supplied by load_archive_data() when one exists.
          //
          // Register assignment used by the generated code:
          //   r0 - out, r1 - in, r2 - offset, r3 - len, r4 - k
 8483     StubId stub_id = StubId::stubgen_mulAdd_id;
 8484     int entry_count = StubInfo::entry_count(stub_id);
 8485     assert(entry_count == 1, "sanity check");
 8486     address start = load_archive_data(stub_id);
 8487     if (start != nullptr) {
            // code for this stub was found by load_archive_data: reuse it
 8488       return start;
 8489     }
 8490     __ align(CodeEntryAlignment);
 8491     StubCodeMark mark(this, stub_id);
 8492 
 8493     start = __ pc();
 8494 
 8495     const Register out     = r0;
 8496     const Register in      = r1;
 8497     const Register offset  = r2;
 8498     const Register len     = r3;
 8499     const Register k       = r4;
 8500 
 8501     BLOCK_COMMENT("Entry:");
 8502     __ enter();
 8503     __ mul_add(out, in, offset, len, k);
 8504     __ leave();
 8505     __ ret(lr);
 8506 
 8507     // record the stub entry and end
 8508     store_archive_data(stub_id, start, __ pc());
 8509 
 8510     return start;
 8511   }
 8512 
 8513   // Arguments:
 8514   //
 8515   // Input:
 8516   //   c_rarg0   - newArr address
 8517   //   c_rarg1   - oldArr address
 8518   //   c_rarg2   - newIdx
 8519   //   c_rarg3   - shiftCount
 8520   //   c_rarg4   - numIter
 8521   //
        // Emits the right-shift worker stub. With newArr advanced by newIdx
        // words, the generated code computes, for i in [0, numIter):
        //   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
        // on 32-bit words, walking from the highest index downwards.
        // Temporaries: rscratch1, rscratch2, r10-r14, v0-v4.
 8522   address generate_bigIntegerRightShift() {
 8523     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 8524     int entry_count = StubInfo::entry_count(stub_id);
 8525     assert(entry_count == 1, "sanity check");
 8526     address start = load_archive_data(stub_id);
 8527     if (start != nullptr) {
            // code for this stub was found by load_archive_data: reuse it
 8528       return start;
 8529     }
 8530     __ align(CodeEntryAlignment);
 8531     StubCodeMark mark(this, stub_id);
 8532     start = __ pc();
 8533 
 8534     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8535 
 8536     Register newArr        = c_rarg0;
 8537     Register oldArr        = c_rarg1;
 8538     Register newIdx        = c_rarg2;
 8539     Register shiftCount    = c_rarg3;
 8540     Register numIter       = c_rarg4;
 8541     Register idx           = numIter;   // same register: numIter doubles as the countdown index
 8542 
 8543     Register newArrCur     = rscratch1;
 8544     Register shiftRevCount = rscratch2;
 8545     Register oldArrCur     = r13;
 8546     Register oldArrNext    = r14;
 8547 
 8548     FloatRegister oldElem0        = v0;
 8549     FloatRegister oldElem1        = v1;
 8550     FloatRegister newElem         = v2;
 8551     FloatRegister shiftVCount     = v3;
 8552     FloatRegister shiftVRevCount  = v4;
 8553 
 8554     __ cbz(idx, Exit);   // nothing to do for zero iterations
 8555 
          // newArr += newIdx * 4 (word index to byte offset)
 8556     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8557 
 8558     // left shift count
          // shiftRevCount = 32 - shiftCount
 8559     __ movw(shiftRevCount, 32);
 8560     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8561 
 8562     // numIter too small to allow a 4-words SIMD loop, rolling back
 8563     __ cmp(numIter, (u1)4);
 8564     __ br(Assembler::LT, ShiftThree);
 8565 
          // Broadcast both counts; shiftVCount is negated so that ushl performs
          // the right shift (the scalar paths below use lsrvw for that element).
 8566     __ dup(shiftVCount,    __ T4S, shiftCount);
 8567     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 8568     __ negr(shiftVCount,   __ T4S, shiftVCount);
 8569 
 8570     __ BIND(ShiftSIMDLoop);
 8571 
 8572     // Calculate the load addresses
          // oldArrNext -> oldArr[idx], oldArrCur -> oldArr[idx + 1],
          // newArrCur -> newArr[idx] (idx already decremented by 4)
 8573     __ sub(idx, idx, 4);
 8574     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8575     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8576     __ add(oldArrCur,  oldArrNext, 4);
 8577 
 8578     // Load 4 words and process
 8579     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 8580     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 8581     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8582     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8583     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8584     __ st1(newElem,   __ T4S,  Address(newArrCur));
 8585 
          // Fewer than 4 words left: finish with the 2-word / 1-word code
 8586     __ cmp(idx, (u1)4);
 8587     __ br(Assembler::LT, ShiftTwoLoop);
 8588     __ b(ShiftSIMDLoop);
 8589 
          // Remainder after the SIMD loop: two words per pass, with a single
          // leftover word handled at ShiftOne.
 8590     __ BIND(ShiftTwoLoop);
 8591     __ cbz(idx, Exit);
 8592     __ cmp(idx, (u1)1);
 8593     __ br(Assembler::EQ, ShiftOne);
 8594 
 8595     // Calculate the load addresses
 8596     __ sub(idx, idx, 2);
 8597     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8598     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8599     __ add(oldArrCur,  oldArrNext, 4);
 8600 
 8601     // Load 2 words and process
 8602     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 8603     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 8604     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 8605     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 8606     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 8607     __ st1(newElem,   __ T2S, Address(newArrCur));
 8608     __ b(ShiftTwoLoop);
 8609 
          // Scalar path for 1 <= numIter <= 3, dispatched on the low two bits of
          // idx: bit 1 clear -> one word; bit 0 clear -> two words; otherwise
          // three words. Each case falls through into the next smaller one.
 8610     __ BIND(ShiftThree);
 8611     __ tbz(idx, 1, ShiftOne);
 8612     __ tbz(idx, 0, ShiftTwo);
          // word 2: newArr[2] = (oldArr[3] >>> shiftCount) | (oldArr[2] << shiftRevCount)
 8613     __ ldrw(r10,  Address(oldArr, 12));
 8614     __ ldrw(r11,  Address(oldArr, 8));
 8615     __ lsrvw(r10, r10, shiftCount);
 8616     __ lslvw(r11, r11, shiftRevCount);
 8617     __ orrw(r12,  r10, r11);
 8618     __ strw(r12,  Address(newArr, 8));
 8619 
 8620     __ BIND(ShiftTwo);
          // word 1
 8621     __ ldrw(r10,  Address(oldArr, 8));
 8622     __ ldrw(r11,  Address(oldArr, 4));
 8623     __ lsrvw(r10, r10, shiftCount);
 8624     __ lslvw(r11, r11, shiftRevCount);
 8625     __ orrw(r12,  r10, r11);
 8626     __ strw(r12,  Address(newArr, 4));
 8627 
 8628     __ BIND(ShiftOne);
          // word 0
 8629     __ ldrw(r10,  Address(oldArr, 4));
 8630     __ ldrw(r11,  Address(oldArr));
 8631     __ lsrvw(r10, r10, shiftCount);
 8632     __ lslvw(r11, r11, shiftRevCount);
 8633     __ orrw(r12,  r10, r11);
 8634     __ strw(r12,  Address(newArr));
 8635 
 8636     __ BIND(Exit);
 8637     __ ret(lr);
 8638 
 8639     // record the stub entry and end
 8640     store_archive_data(stub_id, start, __ pc());
 8641 
 8642     return start;
 8643   }
 8644 
 8645   // Arguments:
 8646   //
 8647   // Input:
 8648   //   c_rarg0   - newArr address
 8649   //   c_rarg1   - oldArr address
 8650   //   c_rarg2   - newIdx
 8651   //   c_rarg3   - shiftCount
 8652   //   c_rarg4   - numIter
 8653   //
        // Emits the left-shift worker stub. With newArr advanced by newIdx
        // words, the generated code computes, for i in [0, numIter):
        //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
        // on 32-bit words, walking forwards with post-incremented pointers.
        // Temporaries: rscratch1, rscratch2, r10-r12, v0-v4.
 8654   address generate_bigIntegerLeftShift() {
 8655     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8656     int entry_count = StubInfo::entry_count(stub_id);
 8657     assert(entry_count == 1, "sanity check");
 8658     address start = load_archive_data(stub_id);
 8659     if (start != nullptr) {
            // code for this stub was found by load_archive_data: reuse it
 8660       return start;
 8661     }
 8662     __ align(CodeEntryAlignment);
 8663     StubCodeMark mark(this, stub_id);
 8664     start = __ pc();
 8665 
 8666     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8667 
 8668     Register newArr        = c_rarg0;
 8669     Register oldArr        = c_rarg1;
 8670     Register newIdx        = c_rarg2;
 8671     Register shiftCount    = c_rarg3;
 8672     Register numIter       = c_rarg4;
 8673 
 8674     Register shiftRevCount = rscratch1;
 8675     Register oldArrNext    = rscratch2;
 8676 
 8677     FloatRegister oldElem0        = v0;
 8678     FloatRegister oldElem1        = v1;
 8679     FloatRegister newElem         = v2;
 8680     FloatRegister shiftVCount     = v3;
 8681     FloatRegister shiftVRevCount  = v4;
 8682 
 8683     __ cbz(numIter, Exit);   // nothing to do for zero iterations
 8684 
          // oldArrNext always points one word past oldArr;
          // newArr += newIdx * 4 (word index to byte offset)
 8685     __ add(oldArrNext, oldArr, 4);
 8686     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8687 
 8688     // right shift count
          // shiftRevCount = 32 - shiftCount
 8689     __ movw(shiftRevCount, 32);
 8690     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8691 
 8692     // numIter too small to allow a 4-words SIMD loop, rolling back
 8693     __ cmp(numIter, (u1)4);
 8694     __ br(Assembler::LT, ShiftThree);
 8695 
          // Broadcast both counts; shiftVRevCount is negated so that ushl
          // performs the right shift (the scalar paths use lsrvw for it).
 8696     __ dup(shiftVCount,     __ T4S, shiftCount);
 8697     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8698     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8699 
 8700     __ BIND(ShiftSIMDLoop);
 8701 
 8702     // load 4 words and process
 8703     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8704     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8705     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8706     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8707     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8708     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8709     __ sub(numIter,   numIter, 4);
 8710 
          // Fewer than 4 words left: finish with the 2-word / 1-word code
 8711     __ cmp(numIter, (u1)4);
 8712     __ br(Assembler::LT, ShiftTwoLoop);
 8713     __ b(ShiftSIMDLoop);
 8714 
          // Remainder after the SIMD loop: two words per pass, with a single
          // leftover word handled at ShiftOne.
 8715     __ BIND(ShiftTwoLoop);
 8716     __ cbz(numIter, Exit);
 8717     __ cmp(numIter, (u1)1);
 8718     __ br(Assembler::EQ, ShiftOne);
 8719 
 8720     // load 2 words and process
 8721     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8722     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8723     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8724     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8725     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8726     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8727     __ sub(numIter,   numIter, 2);
 8728     __ b(ShiftTwoLoop);
 8729 
          // Scalar path for 1 <= numIter <= 3. The first word is always
          // produced; numIter itself is not decremented here, its low two bits
          // select how many more words follow.
 8730     __ BIND(ShiftThree);
 8731     __ ldrw(r10,  __ post(oldArr, 4));
 8732     __ ldrw(r11,  __ post(oldArrNext, 4));
 8733     __ lslvw(r10, r10, shiftCount);
 8734     __ lsrvw(r11, r11, shiftRevCount);
 8735     __ orrw(r12,  r10, r11);
 8736     __ strw(r12,  __ post(newArr, 4));
 8737     __ tbz(numIter, 1, Exit);       // bit 1 clear: numIter was 1, done
 8738     __ tbz(numIter, 0, ShiftOne);   // bit 0 clear: numIter was 2, one word left
 8739 
          // numIter was 3: two more words, this one and the one at ShiftOne
 8740     __ BIND(ShiftTwo);
 8741     __ ldrw(r10,  __ post(oldArr, 4));
 8742     __ ldrw(r11,  __ post(oldArrNext, 4));
 8743     __ lslvw(r10, r10, shiftCount);
 8744     __ lsrvw(r11, r11, shiftRevCount);
 8745     __ orrw(r12,  r10, r11);
 8746     __ strw(r12,  __ post(newArr, 4));
 8747 
          // Last word: no pointer update needed any more
 8748     __ BIND(ShiftOne);
 8749     __ ldrw(r10,  Address(oldArr));
 8750     __ ldrw(r11,  Address(oldArrNext));
 8751     __ lslvw(r10, r10, shiftCount);
 8752     __ lsrvw(r11, r11, shiftRevCount);
 8753     __ orrw(r12,  r10, r11);
 8754     __ strw(r12,  Address(newArr));
 8755 
 8756     __ BIND(Exit);
 8757     __ ret(lr);
 8758 
 8759     // record the stub entry and end
 8760     store_archive_data(stub_id, start, __ pc());
 8761 
 8762     return start;
 8763   }
 8764 
 8765   address generate_count_positives(address &count_positives_long) {
          // Emits the count_positives stub, which has two entry points:
          //   - the returned address: handles every length; lengths <= 15 are
          //     processed by a short path that loads backwards from the end of
          //     the array
          //   - count_positives_long (out parameter): starts directly at the
          //     code used for lengths > 15
          // Registers in the generated code: r1 - array address, r2 - length,
          // r0 - result (expected to hold a copy of the length on entry, see
          // the precondition below). The short path leaves r0 unchanged or
          // sets it to 0; the long path leaves it unchanged or subtracts the
          // adjusted remaining length from it.
 8766     StubId stub_id = StubId::stubgen_count_positives_id;
 8767     GrowableArray<address> entries;
 8768     int entry_count = StubInfo::entry_count(stub_id);
 8769     // We have an extra entry for count_positives_long.
 8770     assert(entry_count == 2, "sanity check");
 8771     address start = load_archive_data(stub_id, &entries);
 8772     if (start != nullptr) {
            // Archived code found: the single extra entry is count_positives_long
 8773       assert(entries.length() == 1,
 8774              "unexpected extra entry count %d", entries.length());
 8775       count_positives_long = entries.at(0);
 8776       return start;
 8777     }
          // large_loop_size: bytes consumed per LARGE_LOOP pass (4 x 16-byte ldp)
          // UPPER_BIT_MASK: the top bit of each of the 8 bytes in a 64-bit word
 8778     const u1 large_loop_size = 64;
 8779     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 8780     int dcache_line = VM_Version::dcache_line_size();
 8781 
 8782     Register ary1 = r1, len = r2, result = r0;
 8783 
 8784     __ align(CodeEntryAlignment);
 8785     StubCodeMark mark(this, stub_id);
 8786 
 8787     address entry = __ pc();
 8788 
 8789     __ enter();
 8790     // precondition: a copy of len is already in result
 8791     // __ mov(result, len);
 8792 
 8793   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8794         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8795 
 8796   __ cmp(len, (u1)15);
 8797   __ br(Assembler::GT, LEN_OVER_15);
 8798   // The only case when execution falls into this code is when pointer is near
 8799   // the end of memory page and we have to avoid reading next page
        // Short path (len <= 15): ary1 is moved to the end of the data and the
        // loads reach backwards from there. NOTE(review): the shifts below
        // discard the bytes in front of the array assuming a little-endian
        // layout -- confirm.
 8800   __ add(ary1, ary1, len);
 8801   __ subs(len, len, 8);
 8802   __ br(Assembler::GT, LEN_OVER_8);
        // len <= 8: load the 8 bytes ending at the array end and shift right by
        // (8 - original len) * 8 bits (lsrv uses the count modulo 64)
 8803   __ ldr(rscratch2, Address(ary1, -8));
 8804   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8805   __ lsrv(rscratch2, rscratch2, rscratch1);
 8806   __ tst(rscratch2, UPPER_BIT_MASK);
 8807   __ csel(result, zr, result, Assembler::NE);   // any top bit set: result = 0
 8808   __ leave();
 8809   __ ret(lr);
 8810   __ bind(LEN_OVER_8);
        // 8 < len <= 15: rscratch2 holds the last 8 bytes (checked whole),
        // rscratch1 the 8 bytes before them (checked after shifting)
 8811   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 8812   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 8813   __ tst(rscratch2, UPPER_BIT_MASK);
 8814   __ br(Assembler::NE, RET_NO_POP);
 8815   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8816   __ lsrv(rscratch1, rscratch1, rscratch2);
 8817   __ tst(rscratch1, UPPER_BIT_MASK);
 8818   __ bind(RET_NO_POP);
        // Reached with the flags of the preceding tst; nothing was pushed on
        // this path, hence no pop before leave()
 8819   __ csel(result, zr, result, Assembler::NE);
 8820   __ leave();
 8821   __ ret(lr);
 8822 
 8823   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8824   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8825 
 8826   count_positives_long = __ pc(); // 2nd entry point
 8827   entries.append(count_positives_long);
 8828 
        // Frame setup for the 2nd entry only: the main entry already executed
        // enter() and branches to LEN_OVER_15, which is bound after this one.
 8829   __ enter();
 8830 
 8831   __ bind(LEN_OVER_15);
 8832     __ push(spilled_regs, sp);
 8833     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8834     __ cbz(rscratch2, ALIGNED);
          // Unaligned start: test the first 16 bytes as they are, then advance
          // ary1 to the next 16-byte boundary (the skipped bytes were part of
          // this test already).
 8835     __ ldp(tmp6, tmp1, Address(ary1));
 8836     __ mov(tmp5, 16);
 8837     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8838     __ add(ary1, ary1, rscratch1);
 8839     __ orr(tmp6, tmp6, tmp1);
 8840     __ tst(tmp6, UPPER_BIT_MASK);
          // len has not been reduced yet, so with result == len on entry this
          // exit yields result - len == 0
 8841     __ br(Assembler::NE, RET_ADJUST);
 8842     __ sub(len, len, rscratch1);
 8843 
 8844   __ bind(ALIGNED);
 8845     __ cmp(len, large_loop_size);
 8846     __ br(Assembler::LT, CHECK_16);
 8847     // Perform 16-byte load as early return in pre-loop to handle situation
 8848     // when initially aligned large array has negative values at starting bytes,
 8849     // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
 8850     // slower. Cases with negative bytes further ahead won't be affected that
 8851     // much. In fact, it'll be faster due to early loads, less instructions and
 8852     // less branches in LARGE_LOOP.
 8853     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8854     __ sub(len, len, 16);
 8855     __ orr(tmp6, tmp6, tmp1);
 8856     __ tst(tmp6, UPPER_BIT_MASK);
 8857     __ br(Assembler::NE, RET_ADJUST_16);
 8858     __ cmp(len, large_loop_size);
 8859     __ br(Assembler::LT, CHECK_16);
 8860 
          // Prefetching is emitted only when SoftwarePrefetchHintDistance is
          // non-negative; the initial prefetch additionally needs a distance
          // of at least one data cache line.
 8861     if (SoftwarePrefetchHintDistance >= 0
 8862         && SoftwarePrefetchHintDistance >= dcache_line) {
 8863       // initial prefetch
 8864       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8865     }
 8866   __ bind(LARGE_LOOP);
 8867     if (SoftwarePrefetchHintDistance >= 0) {
 8868       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8869     }
 8870     // Issue load instructions first, since it can save few CPU/MEM cycles, also
 8871     // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
 8872     // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
 8873     // instructions per cycle and have less branches, but this approach disables
 8874     // early return, thus, all 64 bytes are loaded and checked every time.
 8875     __ ldp(tmp2, tmp3, Address(ary1));
 8876     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8877     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8878     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8879     __ add(ary1, ary1, large_loop_size);
 8880     __ sub(len, len, large_loop_size);
          // OR all eight words together so that one test covers the 64 bytes
 8881     __ orr(tmp2, tmp2, tmp3);
 8882     __ orr(tmp4, tmp4, tmp5);
 8883     __ orr(rscratch1, rscratch1, rscratch2);
 8884     __ orr(tmp6, tmp6, tmp1);
 8885     __ orr(tmp2, tmp2, tmp4);
 8886     __ orr(rscratch1, rscratch1, tmp6);
 8887     __ orr(tmp2, tmp2, rscratch1);
 8888     __ tst(tmp2, UPPER_BIT_MASK);
 8889     __ br(Assembler::NE, RET_ADJUST_LONG);
 8890     __ cmp(len, large_loop_size);
 8891     __ br(Assembler::GE, LARGE_LOOP);
 8892 
 8893   __ bind(CHECK_16); // small 16-byte load pre-loop
 8894     __ cmp(len, (u1)16);
 8895     __ br(Assembler::LT, POST_LOOP16);
 8896 
 8897   __ bind(LOOP16); // small 16-byte load loop
 8898     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8899     __ sub(len, len, 16);
 8900     __ orr(tmp2, tmp2, tmp3);
 8901     __ tst(tmp2, UPPER_BIT_MASK);
 8902     __ br(Assembler::NE, RET_ADJUST_16);
 8903     __ cmp(len, (u1)16);
 8904     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8905 
 8906   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
          // Fewer than 16 bytes left: one full 8-byte word if more than 8
          // remain, then the partial tail
 8907     __ cmp(len, (u1)8);
 8908     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8909     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8910     __ tst(tmp3, UPPER_BIT_MASK);
          // len is reduced only after the branch, so a hit counts this whole
          // word as not guaranteed positive
 8911     __ br(Assembler::NE, RET_ADJUST);
 8912     __ sub(len, len, 8);
 8913 
 8914   __ bind(POST_LOOP16_LOAD_TAIL);
 8915     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
          // Shift left by 64 - len * 8 bits so that only the len tail bytes
          // remain in the tested value
 8916     __ ldr(tmp1, Address(ary1));
 8917     __ mov(tmp2, 64);
 8918     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8919     __ lslv(tmp1, tmp1, tmp4);
 8920     __ tst(tmp1, UPPER_BIT_MASK);
 8921     __ br(Assembler::NE, RET_ADJUST);
 8922     // Fallthrough
 8923 
        // No byte with its top bit set was found: result is returned unchanged
 8924   __ bind(RET_LEN);
 8925     __ pop(spilled_regs, sp);
 8926     __ leave();
 8927     __ ret(lr);
 8928 
 8929     // difference result - len is the count of guaranteed to be
 8930     // positive bytes
          // The labels below fall through into each other: RET_ADJUST_LONG adds
          // back the 64 bytes of the failing LARGE_LOOP pass (48 here + 16
          // below), RET_ADJUST_16 adds back the 16 bytes of the failing ldp.
 8931 
 8932   __ bind(RET_ADJUST_LONG);
 8933     __ add(len, len, (u1)(large_loop_size - 16));
 8934   __ bind(RET_ADJUST_16);
 8935     __ add(len, len, 16);
 8936   __ bind(RET_ADJUST);
 8937     __ pop(spilled_regs, sp);
 8938     __ leave();
 8939     __ sub(result, result, len);
 8940     __ ret(lr);
 8941 
 8942     // record the stub entry and end plus the extra entry
 8943     store_archive_data(stub_id, entry, __ pc(), &entries);
 8944 
 8945     return entry;
 8946   }
 8947 
 8948   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8949         bool usePrefetch, Label &NOT_EQUAL) {
          // Emits the ldp-based comparison loop used for large array equality.
          //   loopThreshold - the loop repeats while cnt1 >= loopThreshold after
          //                   each 64-byte pass
          //   usePrefetch   - when true, a prfm for each array is emitted at the
          //                   top of the loop
          //   NOT_EQUAL     - branch target taken as soon as a difference is seen
          // The code is software-pipelined: 16 bytes of each array are loaded
          // ahead of the loop and compared one step later, so a 16-byte pair is
          // still pending at loop exit and is checked in the post-loop.
          // Note: result is declared but not referenced in this helper.
 8950     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8951         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8952         tmp7 = r12, tmp8 = r13;
 8953     Label LOOP;
 8954 
          // Pre-load the first 16 bytes of each array
 8955     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8956     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8957     __ bind(LOOP);
 8958     if (usePrefetch) {
 8959       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8960       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8961     }
          // Each step loads the next pair into one register group while
          // comparing (eor + orr + cbnz) the group loaded in the previous step
 8962     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8963     __ eor(tmp1, tmp1, tmp2);
 8964     __ eor(tmp3, tmp3, tmp4);
 8965     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8966     __ orr(tmp1, tmp1, tmp3);
 8967     __ cbnz(tmp1, NOT_EQUAL);
 8968     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8969     __ eor(tmp5, tmp5, tmp6);
 8970     __ eor(tmp7, tmp7, tmp8);
 8971     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8972     __ orr(tmp5, tmp5, tmp7);
 8973     __ cbnz(tmp5, NOT_EQUAL);
 8974     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8975     __ eor(tmp1, tmp1, tmp2);
 8976     __ eor(tmp3, tmp3, tmp4);
 8977     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8978     __ orr(tmp1, tmp1, tmp3);
 8979     __ cbnz(tmp1, NOT_EQUAL);
 8980     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8981     __ eor(tmp5, tmp5, tmp6);
 8982     __ sub(cnt1, cnt1, 8 * wordSize);   // 8 words were compared in this pass
 8983     __ eor(tmp7, tmp7, tmp8);
 8984     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8985     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 8986     // cmp) because subs allows an unlimited range of immediate operand.
 8987     __ subs(tmp6, cnt1, loopThreshold);
 8988     __ orr(tmp5, tmp5, tmp7);
 8989     __ cbnz(tmp5, NOT_EQUAL);
          // Consumes the flags set by the subs above
 8990     __ br(__ GE, LOOP);
 8991     // post-loop
          // Compare the pair that was loaded last but not yet checked, and
          // account for its 2 words in cnt1
 8992     __ eor(tmp1, tmp1, tmp2);
 8993     __ eor(tmp3, tmp3, tmp4);
 8994     __ orr(tmp1, tmp1, tmp3);
 8995     __ sub(cnt1, cnt1, 2 * wordSize);
 8996     __ cbnz(tmp1, NOT_EQUAL);
 8997   }
 8998 
 8999   void generate_large_array_equals_loop_simd(int loopThreshold,
 9000         bool usePrefetch, Label &NOT_EQUAL) {
          // Emits the vector (ld1) comparison loop used for large array equality.
          //   loopThreshold - the loop repeats while cnt1 >= loopThreshold after
          //                   each 64-byte pass
          //   usePrefetch   - when true, a prfm for each array is emitted at the
          //                   top of the loop
          //   NOT_EQUAL     - branch target taken as soon as a difference is seen
          // Uses v0-v7, rscratch1 and rscratch2 as temporaries.
          // Note: result is declared but not referenced in this helper.
 9001     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 9002         tmp2 = rscratch2;
 9003     Label LOOP;
 9004 
 9005     __ bind(LOOP);
 9006     if (usePrefetch) {
 9007       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 9008       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 9009     }
          // Load 64 bytes from each array (four 16-byte registers each)
 9010     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 9011     __ sub(cnt1, cnt1, 8 * wordSize);
 9012     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
          // Only the flags of this subs are needed (for the br at the bottom);
          // tmp1 is overwritten by the umov below
 9013     __ subs(tmp1, cnt1, loopThreshold);
          // XOR corresponding registers, then OR everything into v0: v0 is
          // non-zero exactly when some bit differs
 9014     __ eor(v0, __ T16B, v0, v4);
 9015     __ eor(v1, __ T16B, v1, v5);
 9016     __ eor(v2, __ T16B, v2, v6);
 9017     __ eor(v3, __ T16B, v3, v7);
 9018     __ orr(v0, __ T16B, v0, v1);
 9019     __ orr(v1, __ T16B, v2, v3);
 9020     __ orr(v0, __ T16B, v0, v1);
          // Collapse the two 64-bit halves of v0 into one general register
 9021     __ umov(tmp1, v0, __ D, 0);
 9022     __ umov(tmp2, v0, __ D, 1);
 9023     __ orr(tmp1, tmp1, tmp2);
 9024     __ cbnz(tmp1, NOT_EQUAL);
 9025     __ br(__ GE, LOOP);
 9026   }
 9027 
 9028   // a1 = r1 - array1 address
 9029   // a2 = r2 - array2 address
 9030   // result = r0 - return value. Already contains "false"
 9031   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 9032   // r3-r5 are reserved temporary registers
 9033   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
        //
        // Generates the large_array_equals stub. The emitted code compares the
        // remaining contents of the two arrays and writes true to result only when
        // no difference is found; on a mismatch result keeps its entry value.
        // The non-SIMD variant also uses r11-r13 (tmp6-tmp8), which it pushes before
        // and pops after the comparison loops.
        // Returns the stub entry address: either the one supplied by
        // load_archive_data() or the start of the freshly generated code.
 9034   address generate_large_array_equals() {
 9035     StubId stub_id = StubId::stubgen_large_array_equals_id;
 9036     int entry_count = StubInfo::entry_count(stub_id);
 9037     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // instead of generating code.
 9038     address start = load_archive_data(stub_id);
 9039     if (start != nullptr) {
 9040       return start;
 9041     }
 9042     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 9043         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 9044         tmp7 = r12, tmp8 = r13;
 9045     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 9046         SMALL_LOOP, POST_LOOP;
          // NOTE(review): presumably the 16 accounts for data the non-SIMD loop
          // loads ahead of its body - confirm against
          // generate_large_array_equals_loop_nonsimd.
 9047     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 9048     // calculate if at least 32 prefetched bytes are used
 9049     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 9050     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
          // Registers saved and restored by the non-SIMD path only.
 9051     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 9052     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 9053         tmp5, tmp6, tmp7, tmp8);
 9054
 9055     __ align(CodeEntryAlignment);
 9056
 9057     StubCodeMark mark(this, stub_id);
 9058
 9059     address entry = __ pc();
 9060     __ enter();
 9061     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 9062     // also advance pointers to use post-increment instead of pre-increment
 9063     __ add(a1, a1, wordSize);
 9064     __ add(a2, a2, wordSize);
 9065     if (AvoidUnalignedAccesses) {
 9066       // both implementations (SIMD/nonSIMD) are using relatively large load
 9067       // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
 9068       // on some CPUs in case of address is not at least 16-byte aligned.
 9069       // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
 9070       // load if needed at least for 1st address and make it 16-byte aligned.
            //
            // Bit 3 of a1 clear means a1 is already a multiple of 16. Otherwise one
            // extra 8-byte word is compared here. This runs before spilled_regs are
            // pushed, so a mismatch must leave through NOT_EQUAL_NO_POP.
 9071       Label ALIGNED16;
 9072       __ tbz(a1, 3, ALIGNED16);
 9073       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 9074       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 9075       __ sub(cnt1, cnt1, wordSize);
 9076       __ eor(tmp1, tmp1, tmp2);
 9077       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 9078       __ bind(ALIGNED16);
 9079     }
          // Both variants share the same shape: an optional prefetching loop that is
          // entered only when cnt1 exceeds prefetchLoopThreshold, followed by a
          // non-prefetching loop, followed by the common TAIL code.
 9080     if (UseSIMDForArrayEquals) {
 9081       if (SoftwarePrefetchHintDistance >= 0) {
 9082         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 9083         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 9084         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 9085             /* prfm = */ true, NOT_EQUAL);
              // After the prefetching loop, skip the non-prefetching loop when
              // less than nonPrefetchLoopThreshold is left in cnt1.
 9086         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 9087         __ br(__ LT, TAIL);
 9088       }
 9089       __ bind(NO_PREFETCH_LARGE_LOOP);
 9090       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 9091           /* prfm = */ false, NOT_EQUAL);
 9092     } else {
            // From here on a mismatch must go through NOT_EQUAL so that the pushed
            // registers are popped again.
 9093       __ push(spilled_regs, sp);
 9094       if (SoftwarePrefetchHintDistance >= 0) {
 9095         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 9096         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 9097         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 9098             /* prfm = */ true, NOT_EQUAL);
 9099         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 9100         __ br(__ LT, TAIL);
 9101       }
 9102       __ bind(NO_PREFETCH_LARGE_LOOP);
 9103       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 9104           /* prfm = */ false, NOT_EQUAL);
 9105     }
          // TAIL: nothing left means equal. Otherwise compare one 8-byte word per
          // SMALL_LOOP iteration while more than wordSize remains.
 9106     __ bind(TAIL);
 9107       __ cbz(cnt1, EQUAL);
 9108       __ subs(cnt1, cnt1, wordSize);
 9109       __ br(__ LE, POST_LOOP);
 9110     __ bind(SMALL_LOOP);
 9111       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 9112       __ ldr(tmp2, Address(__ post(a2, wordSize)));
            // Flags for the loop-back branch are set here, ahead of the compare.
 9113       __ subs(cnt1, cnt1, wordSize);
 9114       __ eor(tmp1, tmp1, tmp2);
 9115       __ cbnz(tmp1, NOT_EQUAL);
 9116       __ br(__ GT, SMALL_LOOP);
          // POST_LOOP: cnt1 is zero or negative here, so a1 + cnt1 / a2 + cnt1
          // address the final 8 bytes still to be checked; when cnt1 is negative
          // this re-reads some bytes that were compared already.
 9117     __ bind(POST_LOOP);
 9118       __ ldr(tmp1, Address(a1, cnt1));
 9119       __ ldr(tmp2, Address(a2, cnt1));
 9120       __ eor(tmp1, tmp1, tmp2);
 9121       __ cbnz(tmp1, NOT_EQUAL);
 9122     __ bind(EQUAL);
 9123       __ mov(result, true);
          // NOT_EQUAL is also the fall-through exit of the EQUAL path; result is
          // only written above.
 9124     __ bind(NOT_EQUAL);
 9125       if (!UseSIMDForArrayEquals) {
 9126         __ pop(spilled_regs, sp);
 9127       }
 9128     __ bind(NOT_EQUAL_NO_POP);
 9129     __ leave();
 9130     __ ret(lr);
 9131
 9132     // record the stub entry and end
 9133     store_archive_data(stub_id, entry, __ pc());
 9134
 9135     return entry;
 9136   }
 9137 
 9138   // result = r0 - return value. Contains initial hashcode value on entry.
 9139   // ary = r1 - array address
 9140   // cnt = r2 - elements count
 9141   // Clobbers: v0-v13, rscratch1, rscratch2
        //
        // Generates the large_arrays_hashcode stub for the given element type
        // (T_BOOLEAN, T_BYTE, T_CHAR, T_SHORT or T_INT). The emitted code folds cnt
        // elements starting at ary into result using multiplier 31, i.e. the scalar
        // tail performs result = result * 31 + element for each leftover element.
        // The code is laid out as: SMALL LOOP (one vector of vf elements per
        // iteration), TAIL (up to vf - 1 scalar elements), LARGE LOOP (uf vectors,
        // i.e. evf elements, per iteration).
        // Returns the stub entry address: either the one supplied by
        // load_archive_data() or the start of the freshly generated code.
 9142   address generate_large_arrays_hashcode(BasicType eltype) {
 9143     StubId stub_id;
 9144     switch (eltype) {
 9145     case T_BOOLEAN:
 9146       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 9147       break;
 9148     case T_BYTE:
 9149       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 9150       break;
 9151     case T_CHAR:
 9152       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 9153       break;
 9154     case T_SHORT:
 9155       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 9156       break;
 9157     case T_INT:
 9158       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 9159       break;
 9160     default:
 9161       stub_id = StubId::NO_STUBID;
 9162       ShouldNotReachHere();
 9163     };
 9164     int entry_count = StubInfo::entry_count(stub_id);
 9165     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // instead of generating code.
 9166     address start = load_archive_data(stub_id);
 9167     if (start != nullptr) {
 9168       return start;
 9169     }
 9170     const Register result = r0, ary = r1, cnt = r2;
          // vdata0..vdata3 are mapped to v3..v0 so that the four-register ld1 in the
          // large loop, which lists vdata3, vdata2, vdata1, vdata0, names v0..v3 in
          // ascending order.
 9171     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
          // Per-vector accumulators, four 32-bit lanes each.
 9172     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 9173     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
          // Per-iteration multipliers; lane 0 and (large loop, subword types) lane 1.
 9174     const FloatRegister vpowm = v13;
 9175
          // NOTE(review): macro defined outside this block; presumably declares
          // further register aliases - confirm.
 9176     ARRAYS_HASHCODE_REGISTERS;
 9177
 9178     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 9179
 9180     unsigned int vf; // vectorization factor
          // True for subword element types: a vector of 8 elements is accumulated in
          // two halves of 4, with an extra multiply between the halves.
 9181     bool multiply_by_halves;
 9182     Assembler::SIMD_Arrangement load_arrangement;
 9183     switch (eltype) {
 9184     case T_BOOLEAN:
 9185     case T_BYTE:
 9186       load_arrangement = Assembler::T8B;
 9187       multiply_by_halves = true;
 9188       vf = 8;
 9189       break;
 9190     case T_CHAR:
 9191     case T_SHORT:
 9192       load_arrangement = Assembler::T8H;
 9193       multiply_by_halves = true;
 9194       vf = 8;
 9195       break;
 9196     case T_INT:
 9197       load_arrangement = Assembler::T4S;
 9198       multiply_by_halves = false;
 9199       vf = 4;
 9200       break;
 9201     default:
 9202       ShouldNotReachHere();
 9203     }
 9204
 9205     // Unroll factor
 9206     const unsigned uf = 4;
 9207
 9208     // Effective vectorization factor
 9209     const unsigned evf = vf * uf;
 9210
 9211     __ align(CodeEntryAlignment);
 9212
 9213     StubCodeMark mark(this, stub_id);
 9214
 9215     address entry = __ pc();
 9216     __ enter();
 9217
 9218     // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
 9219     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
 9220     // value shouldn't change throughout both loops.
 9221     __ movw(rscratch1, intpow(31U, 3));
 9222     __ mov(vpow, Assembler::S, 0, rscratch1);
 9223     __ movw(rscratch1, intpow(31U, 2));
 9224     __ mov(vpow, Assembler::S, 1, rscratch1);
 9225     __ movw(rscratch1, intpow(31U, 1));
 9226     __ mov(vpow, Assembler::S, 2, rscratch1);
 9227     __ movw(rscratch1, intpow(31U, 0));
 9228     __ mov(vpow, Assembler::S, 3, rscratch1);
 9229
          // Seed accumulator 0 with the incoming hash in lane 3 (the lane that vpow
          // weights with 31^0); all other lanes start at zero.
 9230     __ mov(vmul0, Assembler::T16B, 0);
 9231     __ mov(vmul0, Assembler::S, 3, result);
 9232
          // rscratch2 = number of elements to be handled one vector at a time by the
          // small loop (cnt masked with (uf - 1) * vf). If none, go straight to the
          // large loop.
 9233     __ andr(rscratch2, cnt, (uf - 1) * vf);
 9234     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 9235
          // Small-loop multiplier: 31^(vf/2) when accumulating by halves, else 31^vf.
 9236     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 9237     __ mov(vpowm, Assembler::S, 0, rscratch1);
 9238
 9239     // SMALL LOOP
 9240     __ bind(SMALL_LOOP);
 9241
 9242     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 9243     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
          // Sets the flags used by the br(HI) at the bottom of the loop.
 9244     __ subsw(rscratch2, rscratch2, vf);
 9245
 9246     if (load_arrangement == Assembler::T8B) {
 9247       // Extend 8B to 8H to be able to use vector multiply
 9248       // instructions
 9249       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 9250       if (is_signed_subword_type(eltype)) {
 9251         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9252       } else {
 9253         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9254       }
 9255     }
 9256
          // Add the loaded elements (for subword types: the lower four of them,
          // widened to 32 bits) to the accumulator.
 9257     switch (load_arrangement) {
 9258     case Assembler::T4S:
 9259       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 9260       break;
 9261     case Assembler::T8B:
 9262     case Assembler::T8H:
 9263       assert(is_subword_type(eltype), "subword type expected");
 9264       if (is_signed_subword_type(eltype)) {
 9265         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9266       } else {
 9267         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9268       }
 9269       break;
 9270     default:
 9271       __ should_not_reach_here();
 9272     }
 9273
 9274     // Process the upper half of a vector
 9275     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 9276       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 9277       if (is_signed_subword_type(eltype)) {
 9278         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9279       } else {
 9280         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9281       }
 9282     }
 9283
 9284     __ br(Assembler::HI, SMALL_LOOP);
 9285
 9286     // SMALL LOOP'S EPILOGUE
          // If cnt holds at least one full group of evf elements, continue with the
          // large loop; vmul0 is carried over as its fourth accumulator.
 9287     __ lsr(rscratch2, cnt, exact_log2(evf));
 9288     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 9289
          // Otherwise reduce the accumulator: weight the lanes with <31^3..31^0>,
          // sum them and move the 32-bit sum to result.
 9290     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 9291     __ addv(vmul0, Assembler::T4S, vmul0);
 9292     __ umov(result, vmul0, Assembler::S, 0);
 9293
 9294     // TAIL
          // Within this function TAIL is only reached by fall-through; the large
          // loop re-enters at TAIL_SHORTCUT with rscratch2 already holding cnt % vf.
 9295     __ bind(TAIL);
 9296
 9297     // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
 9298     // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
 9299     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 9300     __ andr(rscratch2, cnt, vf - 1);
 9301     __ bind(TAIL_SHORTCUT);
 9302     __ adr(rscratch1, BR_BASE);
 9303     // For Cortex-A53 offset is 4 because 2 nops are generated.
 9304     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
          // rscratch2 has been consumed by the address computation above and is now
          // reused as the multiplier 31 for the maddw sequence.
 9305     __ movw(rscratch2, 0x1f);
 9306     __ br(rscratch1);
 9307
          // vf - 1 unrolled load + maddw groups; the computed branch above enters
          // this sequence so that exactly cnt % vf of them run before BR_BASE.
          // Every group must have the same size for the address arithmetic to hold.
 9308     for (size_t i = 0; i < vf - 1; ++i) {
 9309       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 9310                                    eltype);
 9311       __ maddw(result, result, rscratch2, rscratch1);
 9312       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 9313       // Generate 2nd nop to have 4 instructions per iteration.
 9314       if (VM_Version::supports_a53mac()) {
 9315         __ nop();
 9316       }
 9317     }
 9318     __ bind(BR_BASE);
 9319
 9320     __ leave();
 9321     __ ret(lr);
 9322
 9323     // LARGE LOOP
 9324     __ bind(LARGE_LOOP_PREHEADER);
 9325
          // rscratch2 = number of full evf-element groups, i.e. large-loop trip count.
 9326     __ lsr(rscratch2, cnt, exact_log2(evf));
 9327
 9328     if (multiply_by_halves) {
 9329       // 31^4 - multiplier between lower and upper parts of a register
 9330       __ movw(rscratch1, intpow(31U, vf / 2));
 9331       __ mov(vpowm, Assembler::S, 1, rscratch1);
 9332       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 9333       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 9334       __ mov(vpowm, Assembler::S, 0, rscratch1);
 9335     } else {
 9336       // 31^16
 9337       __ movw(rscratch1, intpow(31U, evf));
 9338       __ mov(vpowm, Assembler::S, 0, rscratch1);
 9339     }
 9340
          // vmul0 keeps whatever was accumulated so far (initial hash and small-loop
          // data); the other three accumulators start from zero.
 9341     __ mov(vmul3, Assembler::T16B, 0);
 9342     __ mov(vmul2, Assembler::T16B, 0);
 9343     __ mov(vmul1, Assembler::T16B, 0);
 9344
 9345     __ bind(LARGE_LOOP);
 9346
          // Scale all four accumulators by lane 0 of vpowm, then load evf elements
          // into vdata3..vdata0 (v0..v3).
 9347     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 9348     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 9349     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 9350     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 9351
 9352     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 9353            Address(__ post(ary, evf * type2aelembytes(eltype))));
 9354
 9355     if (load_arrangement == Assembler::T8B) {
 9356       // Extend 8B to 8H to be able to use vector multiply
 9357       // instructions
 9358       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 9359       if (is_signed_subword_type(eltype)) {
 9360         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 9361         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 9362         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 9363         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9364       } else {
 9365         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 9366         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 9367         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 9368         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9369       }
 9370     }
 9371
 9372     switch (load_arrangement) {
 9373     case Assembler::T4S:
 9374       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 9375       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 9376       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 9377       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 9378       break;
 9379     case Assembler::T8B:
 9380     case Assembler::T8H:
 9381       assert(is_subword_type(eltype), "subword type expected");
 9382       if (is_signed_subword_type(eltype)) {
 9383         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 9384         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 9385         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 9386         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9387       } else {
 9388         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 9389         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 9390         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 9391         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9392       }
 9393       break;
 9394     default:
 9395       __ should_not_reach_here();
 9396     }
 9397
 9398     // Process the upper half of a vector
          // Uses lane 1 of vpowm (31^(vf/2)) as the multiplier between the halves.
 9399     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 9400       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 9401       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 9402       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 9403       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 9404       if (is_signed_subword_type(eltype)) {
 9405         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 9406         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 9407         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 9408         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9409       } else {
 9410         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 9411         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 9412         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 9413         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9414       }
 9415     }
 9416
 9417     __ subsw(rscratch2, rscratch2, 1);
 9418     __ br(Assembler::HI, LARGE_LOOP);
 9419
          // Large-loop epilogue: reduce each accumulator to a single 32-bit value
          // (lanes weighted by vpow, then summed) and chain the four partial results
          // together, multiplying the running result by 31^vf before adding the next.
 9420     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 9421     __ addv(vmul3, Assembler::T4S, vmul3);
 9422     __ umov(result, vmul3, Assembler::S, 0);
 9423
 9424     __ mov(rscratch2, intpow(31U, vf));
 9425
 9426     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 9427     __ addv(vmul2, Assembler::T4S, vmul2);
 9428     __ umov(rscratch1, vmul2, Assembler::S, 0);
 9429     __ maddw(result, result, rscratch2, rscratch1);
 9430
 9431     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 9432     __ addv(vmul1, Assembler::T4S, vmul1);
 9433     __ umov(rscratch1, vmul1, Assembler::S, 0);
 9434     __ maddw(result, result, rscratch2, rscratch1);
 9435
 9436     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 9437     __ addv(vmul0, Assembler::T4S, vmul0);
 9438     __ umov(rscratch1, vmul0, Assembler::S, 0);
 9439     __ maddw(result, result, rscratch2, rscratch1);
 9440
          // Leftover elements (cnt % vf) are handled by the scalar tail, entered at
          // TAIL_SHORTCUT because rscratch2 already holds the remainder.
 9441     __ andr(rscratch2, cnt, vf - 1);
 9442     __ cbnz(rscratch2, TAIL_SHORTCUT);
 9443
 9444     __ leave();
 9445     __ ret(lr);
 9446
 9447     // record the stub entry and end
 9448     store_archive_data(stub_id, entry, __ pc());
 9449
 9450     return entry;
 9451   }
 9452 
 9453   address generate_dsin_dcos(bool isCos) {
 9454     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 9455     int entry_count = StubInfo::entry_count(stub_id);
 9456     assert(entry_count == 1, "sanity check");
 9457     address start = load_archive_data(stub_id);
 9458     if (start != nullptr) {
 9459       return start;
 9460     }
 9461     __ align(CodeEntryAlignment);
 9462     StubCodeMark mark(this, stub_id);
 9463     start = __ pc();
 9464     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 9465         (address)StubRoutines::aarch64::_two_over_pi,
 9466         (address)StubRoutines::aarch64::_pio2,
 9467         (address)StubRoutines::aarch64::_dsin_coef,
 9468         (address)StubRoutines::aarch64::_dcos_coef);
 9469 
 9470     // record the stub entry and end
 9471     store_archive_data(stub_id, start, __ pc());
 9472 
 9473     return start;
 9474   }
 9475 
 9476   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
        //
        // Register contract, as used by the code below:
        //   tmp2 (r11) - pointer into the Latin1 string; 16 bytes are loaded and the
        //                pointer is post-incremented by 16
        //   cnt1 (r2)  - pointer into the UTF-16 string; four 8-byte loads, each
        //                post-incrementing the pointer by 8
        //   tmp3 (r12) - on entry must already hold the 8 UTF-16 bytes that match the
        //                first 4 Latin1 characters (it is compared before anything is
        //                loaded into it here); on exit holds the next 8 UTF-16 bytes,
        //                loaded but not yet compared
        //   vtmpZ (v0) - interleaved with the Latin1 bytes to widen them; the caller
        //                in this file zeroes it before use
        //   tmpL/tmpU  - caller-chosen registers receiving the widened Latin1 data
        //                and the UTF-16 data respectively
        //   DIFF2      - branch target when the mismatch is between tmp3 and tmpL
        //   DIFF1      - branch target when the mismatch is between tmpU and tmpL
        // In both cases rscratch2 holds the XOR of the two mismatching words.
        // UTF-16 loads alternate between tmpU and tmp3 so that each load is issued
        // one comparison ahead of its use.
 9477   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 9478       Label &DIFF2) {
 9479     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 9480     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 9481
 9482     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 9483     __ ldr(tmpU, Address(__ post(cnt1, 8)));
          // Interleave the low 8 Latin1 bytes with vtmpZ bytes.
 9484     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 9485     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 9486
          // Characters 0-3: widened Latin1 (low half of vtmp3) vs. tmp3.
 9487     __ fmovd(tmpL, vtmp3);
 9488     __ eor(rscratch2, tmp3, tmpL);
 9489     __ cbnz(rscratch2, DIFF2);
 9490
          // Characters 4-7: high half of vtmp3 vs. tmpU.
 9491     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9492     __ umov(tmpL, vtmp3, __ D, 1);
 9493     __ eor(rscratch2, tmpU, tmpL);
 9494     __ cbnz(rscratch2, DIFF1);
 9495
          // Characters 8-11: interleave the high 8 Latin1 bytes, compare with tmp3.
 9496     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 9497     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 9498     __ fmovd(tmpL, vtmp);
 9499     __ eor(rscratch2, tmp3, tmpL);
 9500     __ cbnz(rscratch2, DIFF2);
 9501
          // Characters 12-15: high half of vtmp vs. tmpU. The load into tmp3 is the
          // look-ahead for whatever the caller compares next.
 9502     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9503     __ umov(tmpL, vtmp, __ D, 1);
 9504     __ eor(rscratch2, tmpU, tmpL);
 9505     __ cbnz(rscratch2, DIFF1);
 9506   }
 9507 
 9508   // r0  = result
 9509   // r1  = str1
 9510   // r2  = cnt1
 9511   // r3  = str2
 9512   // r4  = cnt2
 9513   // r10 = tmp1
 9514   // r11 = tmp2
        //
        // Generates the compare_long_string_LU stub (isLU == true) or the
        // compare_long_string_UL stub (isLU == false), which compares a Latin1
        // string with a UTF-16 string. On the first differing character the emitted
        // code stores the difference of the two 16-bit characters in result. When no
        // difference is found the code returns without writing result.
        // NOTE(review): presumably the caller pre-loads result for the "no
        // difference" case - confirm at the call site.
        // The stub sets up no frame (no enter/leave); it additionally uses r12 and
        // r14, which are pushed and popped around the loops, plus v0-v2, rscratch1
        // and rscratch2.
        // Returns the stub entry address: either the one supplied by
        // load_archive_data() or the start of the freshly generated code.
 9515   address generate_compare_long_string_different_encoding(bool isLU) {
 9516     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 9517     int entry_count = StubInfo::entry_count(stub_id);
 9518     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // instead of generating code.
 9519     address start = load_archive_data(stub_id);
 9520     if (start != nullptr) {
 9521       return start;
 9522     }
 9523     __ align(CodeEntryAlignment);
 9524     StubCodeMark mark(this, stub_id);
 9525     address entry = __ pc();
 9526     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 9527         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 9528         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 9529     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9530         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 9531     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 9532     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 9533
 9534     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 9535
          // vtmpZ = 0; it is interleaved with Latin1 bytes to widen them to 16 bits.
 9536     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 9537     // cnt2 == amount of characters left to compare
 9538     // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
 9539     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
          // Step both string pointers past the 4 characters loaded before the stub:
          // wordSize/2 bytes for the Latin1 string, wordSize bytes for the UTF-16 one.
 9540     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 9541     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 9542     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 9543     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 9544     __ eor(rscratch2, tmp1, tmp2);
          // CALCULATE_DIFFERENCE works on tmp1 and rscratch1, so mirror tmp2 there.
 9545     __ mov(rscratch1, tmp2);
 9546     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 9547     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 9548              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
          // From here until the matching pop, tmp3 and tmp4 are live scratch
          // registers; every exit path below pops them again.
 9549     __ push(spilled_regs, sp);
 9550     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 9551     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 9552
          // Look-ahead load required on entry to compare_string_16_x_LU.
 9553     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9554
 9555     if (SoftwarePrefetchHintDistance >= 0) {
 9556       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 9557       __ br(__ LT, NO_PREFETCH);
            // One pass of the prefetching loop runs compare_string_16_x_LU four
            // times (two repeats of two), i.e. 64 characters, matching the
            // sub(cnt2, cnt2, 64) at the bottom. tmp4 is the repeat counter.
 9558       __ bind(LARGE_LOOP_PREFETCH);
 9559         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 9560         __ mov(tmp4, 2);
 9561         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 9562         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 9563           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9564           __ subs(tmp4, tmp4, 1);
 9565           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 9566           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 9567           __ mov(tmp4, 2);
 9568         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 9569           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9570           __ subs(tmp4, tmp4, 1);
 9571           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 9572           __ sub(cnt2, cnt2, 64);
 9573           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 9574           __ br(__ GE, LARGE_LOOP_PREFETCH);
 9575     }
 9576     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 9577     __ bind(NO_PREFETCH);
 9578     __ subs(cnt2, cnt2, 16);
 9579     __ br(__ LT, TAIL);
 9580     __ align(OptoLoopAlignment);
 9581     __ bind(SMALL_LOOP); // smaller loop
            // The flags set here are tested by the br(GE) after the 16-character
            // compare.
 9582       __ subs(cnt2, cnt2, 16);
 9583       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9584       __ br(__ GE, SMALL_LOOP);
            // cnt2 == -16 here means the loops consumed everything up to the
            // final 4 characters.
 9585       __ cmn(cnt2, (u1)16);
 9586       __ br(__ EQ, LOAD_LAST);
 9587     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
            // cnt2 is negative here, so both pointers are moved backwards; the
            // final 16-character compare then re-checks some characters that were
            // compared already.
 9588       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 9589       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
            // Redo the look-ahead load for the repositioned UTF-16 pointer.
 9590       __ ldr(tmp3, Address(cnt1, -8));
 9591       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 9592       __ b(LOAD_LAST);
          // DIFF2: the mismatching UTF-16 word is in tmp3; move it to tmpU so that
          // CALCULATE_DIFFERENCE finds both operands in tmp1 / rscratch1.
 9593     __ bind(DIFF2);
 9594       __ mov(tmpU, tmp3);
 9595     __ bind(DIFF1);
 9596       __ pop(spilled_regs, sp);
 9597       __ b(CALCULATE_DIFFERENCE);
 9598     __ bind(LOAD_LAST);
 9599       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 9600       // No need to load it again
 9601       __ mov(tmpU, tmp3);
 9602       __ pop(spilled_regs, sp);
 9603
 9604       // tmp2 points to the address of the last 4 Latin1 characters right now
 9605       __ ldrs(vtmp, Address(tmp2));
 9606       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 9607       __ fmovd(tmpL, vtmp);
 9608
 9609       __ eor(rscratch2, tmpU, tmpL);
            // No difference: return without touching result.
 9610       __ cbz(rscratch2, DONE);
 9611
 9612     // Find the first different characters in the longwords and
 9613     // compute their difference.
          // On entry rscratch2 holds the XOR of the two words in tmp1 and rscratch1.
          // rev + clz give the bit offset of the lowest-addressed differing byte;
          // masking with -16 rounds that down to a 16-bit character boundary. Both
          // words are shifted right by that amount, truncated to 16 bits and
          // subtracted.
 9614     __ bind(CALCULATE_DIFFERENCE);
 9615       __ rev(rscratch2, rscratch2);
 9616       __ clz(rscratch2, rscratch2);
 9617       __ andr(rscratch2, rscratch2, -16);
 9618       __ lsrv(tmp1, tmp1, rscratch2);
 9619       __ uxthw(tmp1, tmp1);
 9620       __ lsrv(rscratch1, rscratch1, rscratch2);
 9621       __ uxthw(rscratch1, rscratch1);
 9622       __ subw(result, tmp1, rscratch1);
 9623     __ bind(DONE);
 9624       __ ret(lr);
 9625
 9626       // record the stub entry and end
 9627       store_archive_data(stub_id, entry, __ pc());
 9628
 9629       return entry;
 9630   }
 9631 
 9632   // r0 = input (float16)
 9633   // v0 = result (float)
 9634   // v1 = temporary float register
 9635   address generate_float16ToFloat() {
 9636     StubId stub_id = StubId::stubgen_hf2f_id;
 9637     int entry_count = StubInfo::entry_count(stub_id);
 9638     assert(entry_count == 1, "sanity check");
 9639     address start = load_archive_data(stub_id);
 9640     if (start != nullptr) {
 9641       return start;
 9642     }
 9643     __ align(CodeEntryAlignment);
 9644     StubCodeMark mark(this, stub_id);
 9645     address entry = __ pc();
 9646     BLOCK_COMMENT("Entry:");
 9647     __ flt16_to_flt(v0, r0, v1);
 9648     __ ret(lr);
 9649 
 9650     // record the stub entry and end
 9651     store_archive_data(stub_id, entry, __ pc());
 9652 
 9653     return entry;
 9654   }
 9655 
 9656   // v0 = input (float)
 9657   // r0 = result (float16)
 9658   // v1 = temporary float register
 9659   address generate_floatToFloat16() {
 9660     StubId stub_id = StubId::stubgen_f2hf_id;
 9661     int entry_count = StubInfo::entry_count(stub_id);
 9662     assert(entry_count == 1, "sanity check");
 9663     address start = load_archive_data(stub_id);
 9664     if (start != nullptr) {
 9665       return start;
 9666     }
 9667     __ align(CodeEntryAlignment);
 9668     StubCodeMark mark(this, stub_id);
 9669     address entry = __ pc();
 9670     BLOCK_COMMENT("Entry:");
 9671     __ flt_to_flt16(r0, v0, v1);
 9672     __ ret(lr);
 9673 
 9674     // record the stub entry and end
 9675     store_archive_data(stub_id, entry, __ pc());
 9676 
 9677     return entry;
 9678   }
 9679 
 9680   address generate_method_entry_barrier() {
          // Generates the method_entry_barrier stub. The emitted code calls
          // BarrierSetNMethod::nmethod_stub_entry_barrier with the address of the
          // stack slot holding the saved lr. If the call returns zero the stub
          // returns to its caller. Otherwise it loads a new {sp, fp, lr, pc} from the
          // four-word area it reserved on the stack and jumps to the new pc.
          // Returns the stub entry address: either the one supplied by
          // load_archive_data() or the start of the freshly generated code.
 9681     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 9682     int entry_count = StubInfo::entry_count(stub_id);
 9683     assert(entry_count == 1, "sanity check");
 9684     address start = load_archive_data(stub_id);
 9685     if (start != nullptr) {
 9686       return start;
 9687     }
 9688     __ align(CodeEntryAlignment);
 9689     StubCodeMark mark(this, stub_id);
 9690
 9691     Label deoptimize_label;
 9692
 9693     start = __ pc();
 9694
 9695     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 9696
 9697     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 9698       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 9699       // We can get here despite the nmethod being good, if we have not
 9700       // yet applied our cross modification fence (or data fence).
            // Copy the 32-bit value at patching_epoch_addr() into the thread-local
            // slot, then fence.
            // NOTE(review): the "+ 4" presumably selects the 32-bit word following
            // the thread's disarmed guard value - confirm against the thread layout.
 9701       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 9702       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 9703       __ ldrw(rscratch2, rscratch2);
 9704       __ strw(rscratch2, thread_epoch_addr);
 9705       __ isb();
 9706       __ membar(__ LoadLoad);
 9707     }
 9708
 9709     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 9710
 9711     __ enter();
 9712     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 9713
 9714     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 9715
          // The four-word area sits below the frame and above the saved registers,
          // so sp points at it again once the registers have been popped.
 9716     __ push_call_clobbered_registers();
 9717
 9718     __ mov(c_rarg0, rscratch2);
 9719     __ call_VM_leaf
 9720          (CAST_FROM_FN_PTR
 9721           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 9722
 9723     __ reset_last_Java_frame(true);
 9724
          // Keep the call's result in rscratch1 while the saved registers,
          // r0 among them, are restored.
 9725     __ mov(rscratch1, r0);
 9726
 9727     __ pop_call_clobbered_registers();
 9728
 9729     __ cbnz(rscratch1, deoptimize_label);
 9730
          // Zero result: tear down the frame (which also discards the four-word
          // area) and return normally.
 9731     __ leave();
 9732     __ ret(lr);
 9733
 9734     __ BIND(deoptimize_label);
 9735
          // Non-zero result: install the {sp, fp, lr, pc} values found in the
          // four-word area and continue at the new pc.
 9736     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9737     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9738
 9739     __ mov(sp, rscratch1);
 9740     __ br(rscratch2);
 9741
 9742     // record the stub entry and end
 9743     store_archive_data(stub_id, start, __ pc());
 9744
 9745     return start;
 9746   }
 9747 
 9748   // r0  = result
 9749   // r1  = str1
 9750   // r2  = cnt1
 9751   // r3  = str2
 9752   // r4  = cnt2
 9753   // r10 = tmp1
 9754   // r11 = tmp2
 9755   address generate_compare_long_string_same_encoding(bool isLL) {
 9756     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9757     int entry_count = StubInfo::entry_count(stub_id);
 9758     assert(entry_count == 1, "sanity check");
 9759     address start = load_archive_data(stub_id);
 9760     if (start != nullptr) {
 9761       return start;
 9762     }
 9763     __ align(CodeEntryAlignment);
 9764     StubCodeMark mark(this, stub_id);
 9765     address entry = __ pc();
 9766     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9767         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9768 
 9769     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9770 
 9771     // exit from large loop when less than 64 bytes left to read or we're about
 9772     // to prefetch memory behind array border
 9773     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9774 
 9775     // before jumping to stub, pre-load 8 bytes already, so do comparison directly
 9776     __ eor(rscratch2, tmp1, tmp2);
 9777     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9778 
 9779     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9780     // update pointers, because of previous read
 9781     __ add(str1, str1, wordSize);
 9782     __ add(str2, str2, wordSize);
 9783     if (SoftwarePrefetchHintDistance >= 0) {
 9784       __ align(OptoLoopAlignment);
 9785       __ bind(LARGE_LOOP_PREFETCH);
 9786         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9787         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9788 
 9789         for (int i = 0; i < 4; i++) {
 9790           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9791           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9792           __ cmp(tmp1, tmp2);
 9793           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9794           __ br(Assembler::NE, DIFF);
 9795         }
 9796         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9797         __ add(str1, str1, 64);
 9798         __ add(str2, str2, 64);
 9799         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9800         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9801         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9802     }
 9803 
 9804     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9805     __ br(Assembler::LE, LESS16);
 9806     __ align(OptoLoopAlignment);
 9807     __ bind(LOOP_COMPARE16);
 9808       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9809       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9810       __ cmp(tmp1, tmp2);
 9811       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9812       __ br(Assembler::NE, DIFF);
 9813       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9814       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9815       __ br(Assembler::LT, LESS16);
 9816 
 9817       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9818       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9819       __ cmp(tmp1, tmp2);
 9820       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9821       __ br(Assembler::NE, DIFF);
 9822       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9823       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9824       __ br(Assembler::GE, LOOP_COMPARE16);
 9825       __ cbz(cnt2, LENGTH_DIFF);
 9826 
 9827     __ bind(LESS16);
 9828       // each 8 compare
 9829       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9830       __ br(Assembler::LE, LESS8);
 9831       __ ldr(tmp1, Address(__ post(str1, 8)));
 9832       __ ldr(tmp2, Address(__ post(str2, 8)));
 9833       __ eor(rscratch2, tmp1, tmp2);
 9834       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9835       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9836 
 9837     __ bind(LESS8); // directly load last 8 bytes
 9838       if (!isLL) {
 9839         __ add(cnt2, cnt2, cnt2);
 9840       }
 9841       __ ldr(tmp1, Address(str1, cnt2));
 9842       __ ldr(tmp2, Address(str2, cnt2));
 9843       __ eor(rscratch2, tmp1, tmp2);
 9844       __ cbz(rscratch2, LENGTH_DIFF);
 9845       __ b(CAL_DIFFERENCE);
 9846 
 9847     __ bind(DIFF);
 9848       __ cmp(tmp1, tmp2);
 9849       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9850       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9851       // reuse rscratch2 register for the result of eor instruction
 9852       __ eor(rscratch2, tmp1, tmp2);
 9853 
 9854     __ bind(CAL_DIFFERENCE);
 9855       __ rev(rscratch2, rscratch2);
 9856       __ clz(rscratch2, rscratch2);
 9857       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9858       __ lsrv(tmp1, tmp1, rscratch2);
 9859       __ lsrv(tmp2, tmp2, rscratch2);
 9860       if (isLL) {
 9861         __ uxtbw(tmp1, tmp1);
 9862         __ uxtbw(tmp2, tmp2);
 9863       } else {
 9864         __ uxthw(tmp1, tmp1);
 9865         __ uxthw(tmp2, tmp2);
 9866       }
 9867       __ subw(result, tmp1, tmp2);
 9868 
 9869     __ bind(LENGTH_DIFF);
 9870       __ ret(lr);
 9871 
 9872     // record the stub entry and end
 9873     store_archive_data(stub_id, entry, __ pc());
 9874 
 9875     return entry;
 9876   }
 9877 
 9878   enum string_compare_mode {
          // Operand-width combinations understood by
          // generate_compare_long_string_sve(). The first letter describes str1 and
          // the second str2: an 'L' operand is loaded with byte loads (sve_ld1b), a
          // 'U' operand with halfword loads (sve_ld1h).
          // NOTE(review): presumably L = Latin-1 and U = UTF-16 encoded strings -
          // confirm against the callers.
 9879     LL,  // str1 bytes,     str2 bytes
 9880     LU,  // str1 bytes,     str2 halfwords
 9881     UL,  // str1 halfwords, str2 bytes
 9882     UU,  // str1 halfwords, str2 halfwords
 9883   };
 9884 
 9885   // The following registers are declared in aarch64.ad
 9886   // r0  = result
 9887   // r1  = str1
 9888   // r2  = cnt1
 9889   // r3  = str2
 9890   // r4  = cnt2
 9891   // r10 = tmp1
 9892   // r11 = tmp2
 9893   // z0  = ztmp1
 9894   // z1  = ztmp2
 9895   // p0  = pgtmp1
 9896   // p1  = pgtmp2
        //
        // Emits the SVE flavour of the long-string compare stub for the requested
        // operand-width combination and returns its entry address (or the address
        // handed back by load_archive_data() when that is non-null).
        //
        // The generated code compares the strings one vector at a time. At the
        // first differing element it writes (str1 element - str2 element) to
        // result; when no element differs it returns without writing result.
        // NOTE(review): the equal path therefore relies on the caller having
        // pre-loaded r0 (presumably with the length difference) - confirm against
        // the call site.
        // Also clobbers rscratch1, rscratch2 and the flags in addition to the
        // registers listed above.
 9897   address generate_compare_long_string_sve(string_compare_mode mode) {
 9898     StubId stub_id;
          // Map the comparison mode onto the id of the stub being generated.
 9899     switch (mode) {
 9900       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9901       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9902       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9903       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9904       default: ShouldNotReachHere();
 9905     }
 9906     int entry_count = StubInfo::entry_count(stub_id);
 9907     assert(entry_count == 1, "sanity check");
          // If load_archive_data() already supplies an address for this stub, return
          // it without emitting any code.
 9908     address start = load_archive_data(stub_id);
 9909     if (start != nullptr) {
 9910       return start;
 9911     }
 9912     __ align(CodeEntryAlignment);
 9913     StubCodeMark mark(this, stub_id);
 9914     address entry = __ pc();
 9915     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9916              tmp1 = r10, tmp2 = r11;
          // NOTE(review): cnt1 is named above but never referenced by this stub.
 9917
 9918     Label LOOP, DONE, MISMATCH;
 9919     Register vec_len = tmp1;  // elements per vector, set by sve_cntb/sve_cnth below
 9920     Register idx = tmp2;      // element index of the chunk being compared
 9921     // The minimum of the string lengths has been stored in cnt2.
 9922     Register cnt = cnt2;
 9923     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9924     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9925
          // LOAD_PAIR loads one predicated vector from each string at element index
          // idx. Byte-wide operands use sve_ld1b with an unscaled index; halfword-wide
          // operands use sve_ld1h with the index shifted left by 1. In the mixed
          // modes the byte operand is loaded into H-sized lanes so that both vectors
          // can be compared lane by lane.
          // NOTE(review): the src1/src2 macro parameters are never expanded - the
          // body names str1/str2 directly, so the src1/src2 tokens at the use sites
          // are simply dropped by the preprocessor. Consider wiring them through or
          // removing the parameters.
 9926 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9927     switch (mode) {                                                            \
 9928       case LL:                                                                 \
 9929         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9930         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9931         break;                                                                 \
 9932       case LU:                                                                 \
 9933         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9934         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9935         break;                                                                 \
 9936       case UL:                                                                 \
 9937         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9938         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9939         break;                                                                 \
 9940       case UU:                                                                 \
 9941         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9942         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9943         break;                                                                 \
 9944       default:                                                                 \
 9945         ShouldNotReachHere();                                                  \
 9946     }
 9947
 9948     __ mov(idx, 0);
          // Predicate for the first chunk, built from idx (= 0) and cnt.
 9949     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9950
          // vec_len = number of elements (bytes for LL, halfwords otherwise) per vector.
 9951     if (mode == LL) {
 9952       __ sve_cntb(vec_len);
 9953     } else {
 9954       __ sve_cnth(vec_len);
 9955     }
 9956
          // rscratch1 = cnt - vec_len. The main loop keeps going while idx is below
          // this bound; whatever remains is handled by the predicated tail below.
 9957     __ sub(rscratch1, cnt, vec_len);
 9958
 9959     __ bind(LOOP);
 9960
 9961       // main loop
 9962       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9963       __ add(idx, idx, vec_len);
 9964       // Compare strings.
 9965       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
            // Leave the loop as soon as the compare reports a differing lane.
 9966       __ br(__ NE, MISMATCH);
 9967       __ cmp(idx, rscratch1);
 9968       __ br(__ LT, LOOP);
 9969
 9970     // post loop, last iteration
          // Rebuild the predicate from the advanced idx so the tail load is bounded
          // by cnt.
 9971     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9972
 9973     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9974     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
          // No differing lane in the tail: return with result left as it was on entry.
 9975     __ br(__ EQ, DONE);
 9976
 9977     __ bind(MISMATCH);
 9978
 9979     // Crop the vector to find its location.
 9980     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9981     // Extract the first different characters of each string.
 9982     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9983     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9984
 9985     // Compute the difference of the first different characters.
 9986     __ sub(result, rscratch1, rscratch2);
 9987
 9988     __ bind(DONE);
 9989     __ ret(lr);
 9990 #undef LOAD_PAIR
 9991
 9992     // record the stub entry and end
 9993     store_archive_data(stub_id, entry, __ pc());
 9994
 9995     return entry;
 9996   }
 9997 
 9998   void generate_compare_long_strings() {
          // Fills in the four StubRoutines::aarch64::_compare_long_string_* entry
          // points. With UseSVE == 0 they come from the same_encoding /
          // different_encoding generators; otherwise all four come from
          // generate_compare_long_string_sve(). The stubs are generated in the
          // order LL, UU, LU, UL in both cases.
 9999     if (UseSVE == 0) {
10000       StubRoutines::aarch64::_compare_long_string_LL
10001           = generate_compare_long_string_same_encoding(true);
10002       StubRoutines::aarch64::_compare_long_string_UU
10003           = generate_compare_long_string_same_encoding(false);
10004       StubRoutines::aarch64::_compare_long_string_LU
10005           = generate_compare_long_string_different_encoding(true);
10006       StubRoutines::aarch64::_compare_long_string_UL
10007           = generate_compare_long_string_different_encoding(false);
10008     } else {
10009       StubRoutines::aarch64::_compare_long_string_LL
10010           = generate_compare_long_string_sve(LL);
10011       StubRoutines::aarch64::_compare_long_string_UU
10012           = generate_compare_long_string_sve(UU);
10013       StubRoutines::aarch64::_compare_long_string_LU
10014           = generate_compare_long_string_sve(LU);
10015       StubRoutines::aarch64::_compare_long_string_UL
10016           = generate_compare_long_string_sve(UL);
10017     }
10018   }
10019 
10020   // R0 = result
10021   // R1 = str2
10022   // R2 = cnt1
10023   // R3 = str1
10024   // R4 = cnt2
10025   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
        // (r20-r23 are used as temporaries but are pushed on entry and popped on exit.)
        //
        // str1_isL / str2_isL select 1-byte (true) or 2-byte (false) elements for the
        // respective string. The combination (str1_isL == false, str2_isL == true)
        // is not supported and hits ShouldNotReachHere(). Returns the stub entry
        // address. On the no-match path the generated code sets result to -1.
        // NOTE(review): str1 appears to be the pattern and str2 the string being
        // searched (the first element of str1 is broadcast and looked up in str2) -
        // confirm against the caller.
10026   //
10027   // This generic linear code uses a few additional ideas, which make it faster:
10028   // 1) we can safely keep at least 1st register of pattern(since length >= 8)
10029   // in order to skip initial loading(help in systems with 1 ld pipeline)
10030   // 2) we can use "fast" algorithm of finding single character to search for
10031   // first symbol with less branches(1 branch per each loaded register instead
10032   // of branch for each symbol), so, this is where constants like
10033   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from
10034   // 3) after loading and analyzing 1st register of source string, it can be
10035   // used to search for every 1st character entry, saving few loads in
10036   // comparison with "simpler-but-slower" implementation
10037   // 4) in order to avoid lots of push/pop operations, code below is heavily
10038   // re-using/re-initializing/compressing register values, which makes code
10039   // larger and a bit less readable, however, most of extra operations are
10040   // issued during loads or branches, so, penalty is minimal
10041   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
10042     StubId stub_id;
10043     if (str1_isL) {
10044       if (str2_isL) {
10045         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
10046       } else {
10047         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
10048       }
10049     } else {
10050       if (str2_isL) {
10051         ShouldNotReachHere();
10052       } else {
10053         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
10054       }
10055     }
10056     int entry_count = StubInfo::entry_count(stub_id);
10057     assert(entry_count == 1, "sanity check");
          // If load_archive_data() already supplies an address for this stub, return
          // it without emitting any code.
10058     address start = load_archive_data(stub_id);
10059     if (start != nullptr) {
10060       return start;
10061     }
10062     __ align(CodeEntryAlignment);
10063     StubCodeMark mark(this, stub_id);
10064     address entry = __ pc();
10065
          // NOTE(review): str1_chr_size and isL below are computed but never used in
          // this function.
10066     int str1_chr_size = str1_isL ? 1 : 2;
10067     int str2_chr_size = str2_isL ? 1 : 2;
10068     int str1_chr_shift = str1_isL ? 0 : 1;
10069     int str2_chr_shift = str2_isL ? 0 : 1;
10070     bool isL = str1_isL && str2_isL;
10071    // parameters
10072     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
10073     // temporary registers
10074     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
10075     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
10076     // redefinitions
10077     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
10078
10079     __ push(spilled_regs, sp);
10080     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
10081         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
10082         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
10083         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
10084         L_SMALL_CMP_LOOP_LAST_CMP2, L_SMALL_CMP_LOOP_LAST_CMP2,
10085         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
10086     // Read whole register from str1. It is safe, because length >=8 here
10087     __ ldr(ch1, Address(str1));
10088     // Read whole register from str2. It is safe, because length >=8 here
10089     __ ldr(ch2, Address(str2));
10090     __ sub(cnt2, cnt2, cnt1);
          // Isolate the first element of str1 ...
10091     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
10092     if (str1_isL != str2_isL) {
10093       __ eor(v0, __ T16B, v0, v0);
10094     }
          // ... and replicate it into every str2-sized element of "first" by
          // multiplying with 0x01 (or 0x0001) repeated across the register.
10095     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
10096     __ mul(first, first, tmp1);
10097     // check if we have less than 1 register to check
10098     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
10099     if (str1_isL != str2_isL) {
10100       __ fmovd(v1, ch1);
10101     }
10102     __ br(__ LE, L_SMALL);
          // Zero-element test: with x = first ^ ch2, the value
          // (x - 0x01..01) & ~(x | 0x7f..7f) is non-zero when some element of x is
          // zero, i.e. when some element of ch2 equals the first element of str1.
10103     __ eor(ch2, first, ch2);
10104     if (str1_isL != str2_isL) {
10105       __ zip1(v1, __ T16B, v1, v0);
10106     }
10107     __ sub(tmp2, ch2, tmp1);
10108     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10109     __ bics(tmp2, tmp2, ch2);
10110     if (str1_isL != str2_isL) {
10111       __ fmovd(ch1, v1);
10112     }
10113     __ br(__ NE, L_HAS_ZERO);
10114     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10115     __ add(result, result, wordSize/str2_chr_size);
10116     __ add(str2, str2, wordSize);
10117     __ br(__ LT, L_POST_LOOP);
          // Main loop: one 8-byte register of str2 per iteration, same zero-element
          // test as above.
10118     __ BIND(L_LOOP);
10119       __ ldr(ch2, Address(str2));
10120       __ eor(ch2, first, ch2);
10121       __ sub(tmp2, ch2, tmp1);
10122       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10123       __ bics(tmp2, tmp2, ch2);
10124       __ br(__ NE, L_HAS_ZERO);
10125     __ BIND(L_LOOP_PROCEED);
10126       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10127       __ add(str2, str2, wordSize);
10128       __ add(result, result, wordSize/str2_chr_size);
10129       __ br(__ GE, L_LOOP);
10130     __ BIND(L_POST_LOOP);
10131       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
10132       __ br(__ LE, NOMATCH);
10133       __ ldr(ch2, Address(str2));
10134       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10135       __ eor(ch2, first, ch2);
10136       __ sub(tmp2, ch2, tmp1);
10137       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10138       __ mov(tmp4, -1); // all bits set
10139       __ b(L_SMALL_PROCEED);
10140     __ align(OptoLoopAlignment);
10141     __ BIND(L_SMALL);
10142       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10143       __ eor(ch2, first, ch2);
10144       if (str1_isL != str2_isL) {
10145         __ zip1(v1, __ T16B, v1, v0);
10146       }
10147       __ sub(tmp2, ch2, tmp1);
10148       __ mov(tmp4, -1); // all bits set
10149       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10150       if (str1_isL != str2_isL) {
10151         __ fmovd(ch1, v1); // move converted 4 symbols
10152       }
10153     __ BIND(L_SMALL_PROCEED);
10154       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
10155       __ bic(tmp2, tmp2, ch2);
10156       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
10157       __ rbit(tmp2, tmp2);
10158       __ br(__ EQ, NOMATCH);
10159     __ BIND(L_SMALL_HAS_ZERO_LOOP);
10160       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
10161       __ cmp(cnt1, u1(wordSize/str2_chr_size));
10162       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
10163       if (str2_isL) { // LL
10164         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10165         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10166         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10167         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10168         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10169       } else {
10170         __ mov(ch2, 0xE); // all bits in byte set except last one
10171         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10172         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10173         __ lslv(tmp2, tmp2, tmp4);
10174         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10175         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10176         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10177         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10178       }
10179       __ cmp(ch1, ch2);
10180       __ mov(tmp4, wordSize/str2_chr_size);
10181       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
          // Element-by-element comparison of the remaining elements; tmp4 is the
          // element index and cnt1 the loop bound.
10182     __ BIND(L_SMALL_CMP_LOOP);
10183       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10184                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10185       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10186                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10187       __ add(tmp4, tmp4, 1);
10188       __ cmp(tmp4, cnt1);
10189       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
10190       __ cmp(first, ch2);
10191       __ br(__ EQ, L_SMALL_CMP_LOOP);
10192     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
10193       __ cbz(tmp2, NOMATCH); // no more matches. exit
10194       __ clz(tmp4, tmp2);
10195       __ add(result, result, 1); // advance index
10196       __ add(str2, str2, str2_chr_size); // advance pointer
10197       __ b(L_SMALL_HAS_ZERO_LOOP);
10198     __ align(OptoLoopAlignment);
10199     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
10200       __ cmp(first, ch2);
10201       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10202       __ b(DONE);
10203     __ align(OptoLoopAlignment);
10204     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
10205       if (str2_isL) { // LL
10206         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10207         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10208         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10209         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10210         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10211       } else {
10212         __ mov(ch2, 0xE); // all bits in byte set except last one
10213         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10214         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10215         __ lslv(tmp2, tmp2, tmp4);
10216         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10217         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10218         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10219         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10220       }
10221       __ cmp(ch1, ch2);
10222       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10223       __ b(DONE);
10224     __ align(OptoLoopAlignment);
10225     __ BIND(L_HAS_ZERO);
10226       __ rbit(tmp2, tmp2);
10227       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
10228       // Now, perform compression of counters(cnt2 and cnt1) into one register.
10229       // It's fine because both counters are 32bit and are not changed in this
10230       // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
10231       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
10232       __ sub(result, result, 1);
10233     __ BIND(L_HAS_ZERO_LOOP);
10234       __ mov(cnt1, wordSize/str2_chr_size);
10235       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10236       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
10237       if (str2_isL) {
10238         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10239         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10240         __ lslv(tmp2, tmp2, tmp4);
10241         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10242         __ add(tmp4, tmp4, 1);
10243         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10244         __ lsl(tmp2, tmp2, 1);
10245         __ mov(tmp4, wordSize/str2_chr_size);
10246       } else {
10247         __ mov(ch2, 0xE);
10248         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10249         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10250         __ lslv(tmp2, tmp2, tmp4);
10251         __ add(tmp4, tmp4, 1);
10252         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10253         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10254         __ lsl(tmp2, tmp2, 1);
10255         __ mov(tmp4, wordSize/str2_chr_size);
10256         __ sub(str2, str2, str2_chr_size);
10257       }
10258       __ cmp(ch1, ch2);
            // NOTE(review): both branches above already set tmp4 to this value and
            // nothing in between writes tmp4, so one of the two movs looks redundant.
10259       __ mov(tmp4, wordSize/str2_chr_size);
10260       __ br(__ NE, L_CMP_LOOP_NOMATCH);
          // Element-by-element comparison; cnt1 is reused as a scratch register here
          // and the loop bound is read from the upper half of the packed cnt2.
10261     __ BIND(L_CMP_LOOP);
10262       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10263                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10264       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10265                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10266       __ add(tmp4, tmp4, 1);
10267       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10268       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
10269       __ cmp(cnt1, ch2);
10270       __ br(__ EQ, L_CMP_LOOP);
10271     __ BIND(L_CMP_LOOP_NOMATCH);
10272       // here we're not matched
10273       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
10274       __ clz(tmp4, tmp2);
10275       __ add(str2, str2, str2_chr_size); // advance pointer
10276       __ b(L_HAS_ZERO_LOOP);
10277     __ align(OptoLoopAlignment);
10278     __ BIND(L_CMP_LOOP_LAST_CMP);
10279       __ cmp(cnt1, ch2);
10280       __ br(__ NE, L_CMP_LOOP_NOMATCH);
10281       __ b(DONE);
10282     __ align(OptoLoopAlignment);
10283     __ BIND(L_CMP_LOOP_LAST_CMP2);
10284       if (str2_isL) {
10285         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10286         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10287         __ lslv(tmp2, tmp2, tmp4);
10288         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10289         __ add(tmp4, tmp4, 1);
10290         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10291         __ lsl(tmp2, tmp2, 1);
10292       } else {
10293         __ mov(ch2, 0xE);
10294         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10295         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10296         __ lslv(tmp2, tmp2, tmp4);
10297         __ add(tmp4, tmp4, 1);
10298         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10299         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10300         __ lsl(tmp2, tmp2, 1);
10301         __ sub(str2, str2, str2_chr_size);
10302       }
10303       __ cmp(ch1, ch2);
10304       __ br(__ NE, L_CMP_LOOP_NOMATCH);
10305       __ b(DONE);
10306     __ align(OptoLoopAlignment);
10307     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
10308       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
10309       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
10310       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
10311       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
10312       // result by analyzed characters value, so, we can just reset lower bits
10313       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
10314       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
10315       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
10316       // index of last analyzed substring inside current octet. So, str2 in at
10317       // respective start address. We need to advance it to next octet
10318       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
10319       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
10320       __ bfm(result, zr, 0, 2 - str2_chr_shift);
10321       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
10322       __ movw(cnt2, cnt2);
10323       __ b(L_LOOP_PROCEED);
10324     __ align(OptoLoopAlignment);
10325     __ BIND(NOMATCH);
10326       __ mov(result, -1);
          // Common exit: restore the spilled temporaries and return.
10327     __ BIND(DONE);
10328       __ pop(spilled_regs, sp);
10329       __ ret(lr);
10330
10331     // record the stub entry and end
10332     store_archive_data(stub_id, entry, __ pc());
10333
10334     return entry;
10335   }
10336 
10337   void generate_string_indexof_stubs() {
          // Generates the three supported linear indexOf stubs, in the order ll, uu,
          // ul, and stores their entry addresses in StubRoutines::aarch64. The
          // arguments are (str1_isL, str2_isL); the remaining combination
          // (false, true) is rejected by generate_string_indexof_linear().
10338     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
10339     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
10340     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
10341   }
10342 
10343   void inflate_and_store_2_fp_registers(bool generatePrfm,
10344       FloatRegister src1, FloatRegister src2) {
          // Emits code that byte-interleaves src1 and src2 with v0 and stores the
          // resulting 64 bytes at dst (hard-wired to r1), post-incrementing dst by 64.
          // v1-v4 are overwritten as staging registers. When generatePrfm is true a
          // PSTL1STRM prefetch at dst + SoftwarePrefetchHintDistance is emitted
          // between the two pairs of zips.
          // NOTE(review): assumes v0 holds zero so that the interleave widens each
          // byte to 16 bits - the caller's header comment states "V0 = 0".
10345     Register dst = r1;
          // Low and high halves of src1, interleaved with v0.
10346     __ zip1(v1, __ T16B, src1, v0);
10347     __ zip2(v2, __ T16B, src1, v0);
10348     if (generatePrfm) {
10349       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
10350     }
          // Low and high halves of src2, interleaved with v0.
10351     __ zip1(v3, __ T16B, src2, v0);
10352     __ zip2(v4, __ T16B, src2, v0);
10353     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
10354   }
10355 
10356   // R0 = src
10357   // R1 = dst
10358   // R2 = len
10359   // R3 = len >> 3
10360   // V0 = 0
10361   // v1 = loaded 8 bytes
10362   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
        //
        // Emits the stub that inflates a large byte array and returns its entry
        // address (or the address handed back by load_archive_data() when that is
        // non-null). The generated code first widens the 8 bytes already in v1
        // together with 8 more bytes read from src, then processes 64 source bytes
        // (8 octets) per loop iteration, storing 128 bytes to dst each time. A
        // prefetching loop is used while the octet counter is above
        // large_loop_threshold and a plain loop afterwards.
10363   address generate_large_byte_array_inflate() {
10364     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
10365     int entry_count = StubInfo::entry_count(stub_id);
10366     assert(entry_count == 1, "sanity check");
10367     address start = load_archive_data(stub_id);
10368     if (start != nullptr) {
10369       return start;
10370     }
10371     __ align(CodeEntryAlignment);
10372     StubCodeMark mark(this, stub_id);
10373     address entry = __ pc();
10374     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
          // NOTE(review): len (r2) is named here but never referenced by this stub.
10375     Register src = r0, dst = r1, len = r2, octetCounter = r3;
          // Octet count above which the prefetching loop is used.
10376     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
10377
10378     // do one more 8-byte read to have address 16-byte aligned in most cases
10379     // also use single store instruction
10380     __ ldrd(v2, __ post(src, 8));
          // Two octets (v1 and v2) are consumed by the prologue.
10381     __ sub(octetCounter, octetCounter, 2);
10382     __ zip1(v1, __ T16B, v1, v0);
10383     __ zip1(v2, __ T16B, v2, v0);
10384     __ st1(v1, v2, __ T16B, __ post(dst, 32));
          // Preload the first 64 source bytes for whichever loop is entered.
10385     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10386     __ subs(rscratch1, octetCounter, large_loop_threshold);
10387     __ br(__ LE, LOOP_START);
10388     __ b(LOOP_PRFM_START);
10389     __ bind(LOOP_PRFM);
10390       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10391     __ bind(LOOP_PRFM_START);
10392       __ prfm(Address(src, SoftwarePrefetchHintDistance));
10393       __ sub(octetCounter, octetCounter, 8);
            // The flags set here are consumed by the br(GT) below; the helpers in
            // between only emit zip1/zip2/prfm/st1.
10394       __ subs(rscratch1, octetCounter, large_loop_threshold);
10395       inflate_and_store_2_fp_registers(true, v3, v4);
10396       inflate_and_store_2_fp_registers(true, v5, v6);
10397       __ br(__ GT, LOOP_PRFM);
            // Fewer than 8 octets left after the prefetching loop: done.
10398       __ cmp(octetCounter, (u1)8);
10399       __ br(__ LT, DONE);
10400     __ bind(LOOP);
10401       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10402       __ bind(LOOP_START);
10403       __ sub(octetCounter, octetCounter, 8);
            // As above, the flags from this cmp are consumed by the br(GE) below.
10404       __ cmp(octetCounter, (u1)8);
10405       inflate_and_store_2_fp_registers(false, v3, v4);
10406       inflate_and_store_2_fp_registers(false, v5, v6);
10407       __ br(__ GE, LOOP);
10408     __ bind(DONE);
10409       __ ret(lr);
10410
10411     // record the stub entry and end
10412     store_archive_data(stub_id, entry, __ pc());
10413
10414     return entry;
10415   }
10416 
10417   /**
10418    *  Arguments:
10419    *
10420    *  Input:
10421    *  c_rarg0   - current state address
10422    *  c_rarg1   - H key address
10423    *  c_rarg2   - data address
10424    *  c_rarg3   - number of blocks
10425    *
10426    *  Output:
10427    *  Updated state at c_rarg0
         *
         *  Returns the stub entry address (or the address handed back by
         *  load_archive_data() when that is non-null). The generated code handles
         *  one 16-byte block per loop iteration and uses v0-v7, v24, v30 and
         *  rscratch1; the data pointer and block count registers are modified.
         *  NOTE(review): the loop body runs before the block count is tested, so
         *  the stub assumes the block count is at least 1 on entry - confirm
         *  against the callers.
10428    */
10429   address generate_ghash_processBlocks_small() {
10430     // Bafflingly, GCM uses little-endian for the byte order, but
10431     // big-endian for the bit order.  For example, the polynomial 1 is
10432     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
10433     //
10434     // So, we must either reverse the bytes in each word and do
10435     // everything big-endian or reverse the bits in each byte and do
10436     // it little-endian.  On AArch64 it's more idiomatic to reverse
10437     // the bits in each byte (we have an instruction, RBIT, to do
10438     // that) and keep the data in little-endian bit order through the
10439     // calculation, bit-reversing the inputs and outputs.
10440
10441     StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
10442     int entry_count = StubInfo::entry_count(stub_id);
10443     assert(entry_count == 1, "sanity check");
10444     address start = load_archive_data(stub_id);
10445     if (start != nullptr) {
10446       return start;
10447     }
10448     __ align(CodeEntryAlignment);
10449     StubCodeMark mark(this, stub_id);
10450     Label polynomial; // local data generated at end of stub
          // From here on "start" is reused to hold the entry address of the new stub.
10451     start = __ pc();
10452
10453     Register state   = c_rarg0;
10454     Register subkeyH = c_rarg1;
10455     Register data    = c_rarg2;
10456     Register blocks  = c_rarg3;
10457
10458     FloatRegister vzr = v30;
10459     __ eor(vzr, __ T16B, vzr, vzr); // zero register
10460
10461     __ adr(rscratch1, polynomial);
10462     __ ldrq(v24, rscratch1);    // The field polynomial
10463
10464     __ ldrq(v0, Address(state));
10465     __ ldrq(v1, Address(subkeyH));
10466
10467     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
10468     __ rbit(v0, __ T16B, v0);
10469     __ rev64(v1, __ T16B, v1);
10470     __ rbit(v1, __ T16B, v1);
10471
10472     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
10473     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
10474
10475     {
10476       Label L_ghash_loop;
10477       __ bind(L_ghash_loop);
10478
10479       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
10480                                                  // reversing each byte
10481       __ rbit(v2, __ T16B, v2);
10482       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
10483
10484       // Multiply state in v2 by subkey in v1
10485       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
10486                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
10487                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
10488       // Reduce v7:v5 by the field polynomial
10489       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
10490
            // Count down and loop until no blocks remain.
10491       __ sub(blocks, blocks, 1);
10492       __ cbnz(blocks, L_ghash_loop);
10493     }
10494
10495     // The bit-reversed result is at this point in v0
10496     __ rev64(v0, __ T16B, v0);
10497     __ rbit(v0, __ T16B, v0);
10498
10499     __ st1(v0, __ T16B, state);
10500     __ ret(lr);
10501
10502     // bind label and generate local polynomial data
          // The data is emitted after the ret and aligned to 2 * wordSize; it is
          // read with the ldrq at the top of the stub.
10503     __ align(wordSize * 2);
10504     __ bind(polynomial);
10505     __ emit_int64(0x87);  // The low-order bits of the field
10506                           // polynomial (i.e. p = z^7+z^2+z+1)
10507                           // repeated in the low and high parts of a
10508                           // 128-bit vector
10509     __ emit_int64(0x87);
10510
10511     // record the stub entry and end
10512     store_archive_data(stub_id, start, __ pc());
10513
10514     return start;
10515   }
10516 
10517   address generate_ghash_processBlocks(address small) {
          // Emits the GHASH processBlocks stub and returns its entry address.
          //
          // small - branch target taken when fewer than 2 * unroll blocks are
          //         requested, and again for any blocks still outstanding after
          //         the wide routine (presumably the entry of the single-block
          //         GHASH stub; confirm against the caller).
          //
          // Register arguments of the generated code:
          //   c_rarg0 = state, c_rarg1 = subkeyH, c_rarg2 = data, c_rarg3 = blocks
10518     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
10519     int entry_count = StubInfo::entry_count(stub_id);
10520     assert(entry_count == 1, "sanity check");
10521     address start = load_archive_data(stub_id);
10522     if (start != nullptr) {
            // load_archive_data() supplied an entry address: use it and emit nothing.
10523       return start;
10524     }
10525     Label polynomial;           // local data generated after stub
10526     __ align(CodeEntryAlignment);
10527     StubCodeMark mark(this, stub_id);
10528     start = __ pc();
10529
10530     Register state   = c_rarg0;
10531     Register subkeyH = c_rarg1;
10532     Register data    = c_rarg2;
10533     Register blocks  = c_rarg3;
10534
          // unroll is a compile-time constant, so both "unroll > 1" guards
          // below are always taken as written.
10535     const int unroll = 4;
10536
          // Fewer than 2 * unroll blocks: hand the whole request to `small`.
10537     __ cmp(blocks, (unsigned char)(unroll * 2));
10538     __ br(__ LT, small);
10539
10540     if (unroll > 1) {
10541     // Save v8-v15 on the stack (two 64-byte groups) before entering the routine.
          // NOTE(review): presumably needed because ghash_processBlocks_wide
          // uses these registers and the platform ABI expects them preserved --
          // confirm against the macro assembler routine.
10542       __ sub(sp, sp, 4 * 16);
10543       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
10544       __ sub(sp, sp, 4 * 16);
10545       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
10546     }
10547
10548     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
10549
10550     if (unroll > 1) {
10551       // And restore state (reverse order of the stores above; each load
            // pops its 64 bytes via post-increment of sp)
10552       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
10553       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
10554     }
10555
          // Any blocks still outstanding are finished by `small`; otherwise return.
10556     __ cmp(blocks, (unsigned char)0);
10557     __ br(__ GT, small);
10558
10559     __ ret(lr);
10560
10561     // bind label and generate polynomial data
          // (16-byte aligned, placed after the ret so it is never executed)
10562     __ align(wordSize * 2);
10563     __ bind(polynomial);
10564     __ emit_int64(0x87);  // The low-order bits of the field
10565                           // polynomial (i.e. p = z^7+z^2+z+1)
10566                           // repeated in the low and high parts of a
10567                           // 128-bit vector
10568     __ emit_int64(0x87);
10569
10570     // record the stub entry and end
10571     store_archive_data(stub_id, start, __ pc());
10572
10573     return start;
10574   }
10575 
10576   void generate_base64_encode_simdround(Register src, Register dst,
10577         FloatRegister codec, u8 size) {
          // Emits one SIMD round of Base64 encoding: reads 3 * size source bytes
          // from [src], writes 4 * size encoded bytes to [dst], and advances both
          // pointers by post-increment.
          //
          // src   - register holding the current source address
          // dst   - register holding the current destination address
          // codec - first vector register of the lookup table; the tbl
          //         instructions below read 4 table registers starting here
          // size  - 16 selects the T16B arrangement, any other value T8B
          //         (the caller in this file passes 16 or 8)
          //
          // Vector registers v4-v6 and v16-v23 are used as temporaries.
10578
10579     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
10580     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
10581     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
10582
10583     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10584
          // De-interleaving load: in0/in1/in2 receive the 1st/2nd/3rd byte of
          // every 3-byte group.
10585     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
10586
          // ind0 = top 6 bits of byte 0
10587     __ ushr(ind0, arrangement, in0,  2);
10588
          // ind1 = low 2 bits of byte 0 followed by top 4 bits of byte 1
10589     __ ushr(ind1, arrangement, in1,  2);
10590     __ shl(in0,   arrangement, in0,  6);
10591     __ orr(ind1,  arrangement, ind1, in0);
10592     __ ushr(ind1, arrangement, ind1, 2);
10593
          // ind2 = low 4 bits of byte 1 followed by top 2 bits of byte 2
10594     __ ushr(ind2, arrangement, in2,  4);
10595     __ shl(in1,   arrangement, in1,  4);
10596     __ orr(ind2,  arrangement, in1,  ind2);
10597     __ ushr(ind2, arrangement, ind2, 2);
10598
          // ind3 = low 6 bits of byte 2
10599     __ shl(ind3,  arrangement, in2,  2);
10600     __ ushr(ind3, arrangement, ind3, 2);
10601
          // Translate each 6-bit index through the 4-register table.
10602     __ tbl(out0,  arrangement, codec,  4, ind0);
10603     __ tbl(out1,  arrangement, codec,  4, ind1);
10604     __ tbl(out2,  arrangement, codec,  4, ind2);
10605     __ tbl(out3,  arrangement, codec,  4, ind3);
10606
          // Interleaving store: 4 output characters per 3-byte input group.
10607     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
10608   }
10609 
10610    /**
         *  Emits the Base64 encodeBlock stub and returns its entry address.
         *
10611    *  Arguments:
10612    *
10613    *  Input:
10614    *  c_rarg0   - src_start   (source array address)
10615    *  c_rarg1   - src_offset  (offset of the first source byte)
10616    *  c_rarg2   - src_end     (source end offset; the stub computes
         *                           length = c_rarg2 - c_rarg1)
10617    *  c_rarg3   - dest_start  (destination array address)
10618    *  c_rarg4   - dest_offset (offset of the first byte written)
10619    *  c_rarg5   - isURL       (non-zero selects the URL table)
10620    *
10621    */
10622   address generate_base64_encodeBlock() {
10623
10624     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
10625     int entry_count = StubInfo::entry_count(stub_id);
10626     assert(entry_count == 1, "sanity check");
10627     address start = load_archive_data(stub_id);
10628     if (start != nullptr) {
            // load_archive_data() supplied an entry address: use it and emit nothing.
10629       return start;
10630     }
10631     __ align(CodeEntryAlignment);
10632     StubCodeMark mark(this, stub_id);
10633     start = __ pc();
10634
10635     Register src   = c_rarg0;  // source array
10636     Register soff  = c_rarg1;  // source start offset
10637     Register send  = c_rarg2;  // source end offset
10638     Register dst   = c_rarg3;  // dest array
10639     Register doff  = c_rarg4;  // position for writing to dest array
10640     Register isURL = c_rarg5;  // Base64 or URL character set
10641
10642     // c_rarg6 and c_rarg7 are free to use as temps
10643     Register codec  = c_rarg6;
10644     Register length = c_rarg7;
10645
10646     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
10647
          // Turn (base, offset) pairs into absolute pointers and compute the
          // number of source bytes to encode.
10648     __ add(src, src, soff);
10649     __ add(dst, dst, doff);
10650     __ sub(length, send, soff);
10651
10652     // load the codec base address (standard table unless isURL is non-zero)
10653     __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
10654     __ cbz(isURL, ProcessData);
10655     __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));
10656
10657     __ BIND(ProcessData);
10658
10659     // too short to form a SIMD loop: fall back to the scalar 3-byte loop
10660     __ cmp(length, (u1)24);
10661     __ br(Assembler::LT, Process3B);
10662
          // Load the 64-byte lookup table into v0-v3 for the SIMD rounds.
10663     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
10664
          // 48 source bytes per iteration (16 lanes x 3 bytes).
10665     __ BIND(Process48B);
10666     __ cmp(length, (u1)48);
10667     __ br(Assembler::LT, Process24B);
10668     generate_base64_encode_simdround(src, dst, v0, 16);
10669     __ sub(length, length, 48);
10670     __ b(Process48B);
10671
          // At most one 24-byte round (8 lanes x 3 bytes); no back branch here.
10672     __ BIND(Process24B);
10673     __ cmp(length, (u1)24);
10674     __ br(Assembler::LT, SIMDExit);
10675     generate_base64_encode_simdround(src, dst, v0, 8);
10676     __ sub(length, length, 24);
10677
10678     __ BIND(SIMDExit);
10679     __ cbz(length, Exit);
10680
          // Scalar loop, r10-r15 are temporaries.
          // NOTE(review): the loop exits only when length reaches exactly 0, so
          // this assumes length is a positive multiple of 3 on entry -- confirm
          // against the caller.
10681     __ BIND(Process3B);
10682     //  3 src bytes, 24 bits (assembled big-endian into r12)
10683     __ ldrb(r10, __ post(src, 1));
10684     __ ldrb(r11, __ post(src, 1));
10685     __ ldrb(r12, __ post(src, 1));
10686     __ orrw(r11, r11, r10, Assembler::LSL, 8);
10687     __ orrw(r12, r12, r11, Assembler::LSL, 8);
10688     // codec index: four 6-bit fields, most significant first in r15
10689     __ ubfmw(r15, r12, 18, 23);
10690     __ ubfmw(r14, r12, 12, 17);
10691     __ ubfmw(r13, r12, 6,  11);
10692     __ andw(r12,  r12, 63);
10693     // get the code based on the codec
10694     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
10695     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
10696     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
10697     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
10698     __ strb(r15, __ post(dst, 1));
10699     __ strb(r14, __ post(dst, 1));
10700     __ strb(r13, __ post(dst, 1));
10701     __ strb(r12, __ post(dst, 1));
10702     __ sub(length, length, 3);
10703     __ cbnz(length, Process3B);
10704
10705     __ BIND(Exit);
10706     __ ret(lr);
10707
10708     // record the stub entry and end
10709     store_archive_data(stub_id, start, __ pc());
10710
10711     return start;
10712   }
10713 
10714   void generate_base64_decode_simdround(Register src, Register dst,
10715         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
          // Emits one SIMD round of Base64 decoding: reads 4 * size characters
          // from [src] (post-incrementing src) and, when all of them are legal,
          // writes 3 * size decoded bytes to [dst] (post-incrementing dst).
          // When an illegal character is found, only the decoded bytes that
          // precede it are stored and the code branches to Exit.
          //
          // src, dst - registers holding the current source / destination address
          // codecL   - first of the 4 table registers used for the tbl lookup
          // codecH   - first of the 4 table registers used for the tbx lookup
          // size     - 16 selects the T16B arrangement, any other value T8B
          //            (the caller in this file passes 16 or 8)
          // Exit     - label branched to when illegal input is detected
          //
          // v27 is read but not set here; the caller in this file fills every
          // byte lane of v27 with 63 before the first round.
          // Clobbers v16-v26, v28-v31, rscratch2 and, on the illegal-input
          // path, r10-r13.
10716
10717     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
10718     FloatRegister out0 = v20, out1 = v21, out2 = v22;
10719
10720     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
10721     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
10722
10723     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
10724
10725     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10726
          // De-interleaving load: in0..in3 receive the 1st..4th character of
          // every 4-character group.
10727     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
10728
10729     // we need unsigned saturating subtract, to make sure all input values
10730     // in range [0, 63] will have 0U value in the higher half lookup
          // (done on all 16 lanes regardless of `size`)
10731     __ uqsubv(decH0, __ T16B, in0, v27);
10732     __ uqsubv(decH1, __ T16B, in1, v27);
10733     __ uqsubv(decH2, __ T16B, in2, v27);
10734     __ uqsubv(decH3, __ T16B, in3, v27);
10735
10736     // lower half lookup
10737     __ tbl(decL0, arrangement, codecL, 4, in0);
10738     __ tbl(decL1, arrangement, codecL, 4, in1);
10739     __ tbl(decL2, arrangement, codecL, 4, in2);
10740     __ tbl(decL3, arrangement, codecL, 4, in3);
10741
10742     // higher half lookup
10743     __ tbx(decH0, arrangement, codecH, 4, decH0);
10744     __ tbx(decH1, arrangement, codecH, 4, decH1);
10745     __ tbx(decH2, arrangement, codecH, 4, decH2);
10746     __ tbx(decH3, arrangement, codecH, 4, decH3);
10747
10748     // combine lower and higher
10749     __ orr(decL0, arrangement, decL0, decH0);
10750     __ orr(decL1, arrangement, decL1, decH1);
10751     __ orr(decL2, arrangement, decL2, decH2);
10752     __ orr(decL3, arrangement, decL3, decH3);
10753
10754     // check illegal inputs, value larger than 63 (maximum of 6 bits)
          // decH0..decH3 become per-lane masks; they are OR-ed into in2 so that
          // in2 flags every 4-character group containing an illegal character,
          // and rscratch2 ends up non-zero iff any lane is flagged.
10755     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10756     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10757     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10758     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10759     __ orr(in0, arrangement, decH0, decH1);
10760     __ orr(in1, arrangement, decH2, decH3);
10761     __ orr(in2, arrangement, in0,   in1);
10762     __ umaxv(in3, arrangement, in2);
10763     __ umov(rscratch2, in3, __ B, 0);
10764
10765     // get the data to output (repack four 6-bit values into three bytes)
10766     __ shl(out0,  arrangement, decL0, 2);
10767     __ ushr(out1, arrangement, decL1, 4);
10768     __ orr(out0,  arrangement, out0,  out1);
10769     __ shl(out1,  arrangement, decL1, 4);
10770     __ ushr(out2, arrangement, decL2, 2);
10771     __ orr(out1,  arrangement, out1,  out2);
10772     __ shl(out2,  arrangement, decL2, 6);
10773     __ orr(out2,  arrangement, out2,  decL3);
10774
10775     __ cbz(rscratch2, NoIllegalData);
10776
10777     // handle illegal input
          // r10 holds 8 lanes of the illegal-group mask; r11/r12/r13 hold the
          // matching 8 lanes of output bytes 0/1/2.
10778     __ umov(r10, in2, __ D, 0);
10779     if (size == 16) {
10780       __ cbnz(r10, ErrorInLowerHalf);
10781
10782       // illegal input is in higher half, store the lower half now.
10783       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10784
10785       __ umov(r10, in2,  __ D, 1);
10786       __ umov(r11, out0, __ D, 1);
10787       __ umov(r12, out1, __ D, 1);
10788       __ umov(r13, out2, __ D, 1);
10789       __ b(StoreLegalData);
10790
10791       __ BIND(ErrorInLowerHalf);
10792     }
10793     __ umov(r11, out0, __ D, 0);
10794     __ umov(r12, out1, __ D, 0);
10795     __ umov(r13, out2, __ D, 0);
10796
          // Store one decoded 3-byte group per iteration, lowest lane first,
          // until the first flagged lane is reached. r10 is known to contain a
          // flagged lane on every path into this loop, so the loop always
          // leaves through Exit.
10797     __ BIND(StoreLegalData);
10798     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10799     __ strb(r11, __ post(dst, 1));
10800     __ strb(r12, __ post(dst, 1));
10801     __ strb(r13, __ post(dst, 1));
10802     __ lsr(r10, r10, 8);
10803     __ lsr(r11, r11, 8);
10804     __ lsr(r12, r12, 8);
10805     __ lsr(r13, r13, 8);
10806     __ b(StoreLegalData);
10807
10808     __ BIND(NoIllegalData);
10809     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10810   }
10811 
10812 
10813    /**
         *  Emits the Base64 decodeBlock stub and returns its entry address.
         *  The generated code returns, in c_rarg0, the number of bytes it
         *  wrote to the destination.
         *
10814    *  Arguments:
10815    *
10816    *  Input:
10817    *  c_rarg0   - src_start   (source array address)
10818    *  c_rarg1   - src_offset  (offset of the first source byte)
10819    *  c_rarg2   - src_end     (source end offset; the stub computes
         *                           length = c_rarg2 - c_rarg1)
10820    *  c_rarg3   - dest_start  (destination array address)
10821    *  c_rarg4   - dest_offset (offset of the first byte written)
10822    *  c_rarg5   - isURL       (non-zero selects the URL tables)
10823    *  c_rarg6   - isMIME      (not read by this implementation)
10824    *
10825    */
10826   address generate_base64_decodeBlock() {
10827
10828     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10829     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10830     // titled "Base64 decoding".
10831
10832     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10833     int entry_count = StubInfo::entry_count(stub_id);
10834     assert(entry_count == 1, "sanity check");
10835     address start = load_archive_data(stub_id);
10836     if (start != nullptr) {
            // load_archive_data() supplied an entry address: use it and emit nothing.
10837       return start;
10838     }
10839     __ align(CodeEntryAlignment);
10840     StubCodeMark mark(this, stub_id);
10841     start = __ pc();
10842
10843     Register src    = c_rarg0;  // source array
10844     Register soff   = c_rarg1;  // source start offset
10845     Register send   = c_rarg2;  // source end offset
10846     Register dst    = c_rarg3;  // dest array
10847     Register doff   = c_rarg4;  // position for writing to dest array
10848     Register isURL  = c_rarg5;  // Base64 or URL character set
10849     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10850
10851     Register length = send;    // reuse send as length of source data to process
10852
          // c_rarg6 is reused as a temp since isMIME is never read.
10853     Register simd_codec   = c_rarg6;
10854     Register nosimd_codec = c_rarg7;
10855
10856     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10857
10858     __ enter();
10859
10860     __ add(src, src, soff);
10861     __ add(dst, dst, doff);
10862
          // Remember the initial destination pointer; Exit subtracts it from
          // the final one to produce the byte count.
10863     __ mov(doff, dst);
10864
10865     __ sub(length, send, soff);
          // Clear the two low bits of length (round down to a multiple of 4).
10866     __ bfm(length, zr, 0, 1);
10867
10868     __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
10869     __ cbz(isURL, ProcessData);
10870     __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));
10871
          // rscratch1 is the byte counter of the scalar loop. For inputs shorter
          // than 144 bytes the whole input is decoded by the scalar loop.
          // NOTE(review): the scalar loop body runs before its counter is
          // tested, so this assumes length >= 4 here -- confirm against the caller.
10872     __ BIND(ProcessData);
10873     __ mov(rscratch1, length);
10874     __ cmp(length, (u1)144); // 144 = 80 + 64
10875     __ br(Assembler::LT, Process4B);
10876
10877     // In the MIME case, the line length cannot be more than 76
10878     // bytes (see RFC 2045). This is too short a block for SIMD
10879     // to be worthwhile, so we use non-SIMD here.
          // Starting the counter at 79 makes the scalar loop decode exactly
          // 80 bytes (20 iterations) and finish with rscratch1 == -1.
10880     __ movw(rscratch1, 79);
10881
10882     __ BIND(Process4B);
          // Load 4 characters and split them into r10..r13 (first character in r10).
10883     __ ldrw(r14, __ post(src, 4));
10884     __ ubfxw(r10, r14, 0,  8);
10885     __ ubfxw(r11, r14, 8,  8);
10886     __ ubfxw(r12, r14, 16, 8);
10887     __ ubfxw(r13, r14, 24, 8);
10888     // get the de-code
10889     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10890     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10891     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10892     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10893     // error detection, 255u indicates an illegal input
          // (bit 7 of the OR of all four decoded values is tested)
10894     __ orrw(r14, r10, r11);
10895     __ orrw(r15, r12, r13);
10896     __ orrw(r14, r14, r15);
10897     __ tbnz(r14, 7, Exit);
10898     // recover the data: pack the first two output bytes into r14 (byte
          // swapped by rev16w so that strh stores them in order) and build the
          // third output byte in r13
10899     __ lslw(r14, r10, 10);
10900     __ bfiw(r14, r11, 4, 6);
10901     __ bfmw(r14, r12, 2, 5);
10902     __ rev16w(r14, r14);
10903     __ bfiw(r13, r12, 6, 2);
10904     __ strh(r14, __ post(dst, 2));
10905     __ strb(r13, __ post(dst, 1));
10906     // non-simd loop
10907     __ subsw(rscratch1, rscratch1, 4);
10908     __ br(Assembler::GT, Process4B);
10909
10910     // if exiting from the 80-byte scalar pre-pass (counter started at 79),
10911     // rscratch1 == -1; otherwise, rscratch1 == 0 and all input is consumed.
10912     __ cbzw(rscratch1, Exit);
10913     __ sub(length, length, 80);
10914
10915     __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
10916     __ cbz(isURL, SIMDEnter);
10917     __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));
10918
10919     __ BIND(SIMDEnter);
          // v0-v3: first 64 table bytes, v4-v7: the following 64 bytes,
          // v27: the constant 63 in every byte lane (read by the SIMD rounds).
10920     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10921     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10922     __ mov(rscratch1, 63);
10923     __ dup(v27, __ T16B, rscratch1);
10924
          // 64 characters per iteration (16 lanes x 4 characters).
10925     __ BIND(Process64B);
10926     __ cmp(length, (u1)64);
10927     __ br(Assembler::LT, Process32B);
10928     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10929     __ sub(length, length, 64);
10930     __ b(Process64B);
10931
          // 32 characters per iteration (8 lanes x 4 characters).
10932     __ BIND(Process32B);
10933     __ cmp(length, (u1)32);
10934     __ br(Assembler::LT, SIMDExit);
10935     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10936     __ sub(length, length, 32);
10937     __ b(Process32B);
10938
          // Whatever is left goes back through the scalar loop, which this time
          // finishes with rscratch1 == 0 and therefore exits.
10939     __ BIND(SIMDExit);
10940     __ cbz(length, Exit);
10941     __ movw(rscratch1, length);
10942     __ b(Process4B);
10943
10944     __ BIND(Exit);
          // Result: bytes written = final dst - initial dst (saved in doff).
10945     __ sub(c_rarg0, dst, doff);
10946
10947     __ leave();
10948     __ ret(lr);
10949
10950     // record the stub entry and end
10951     store_archive_data(stub_id, start, __ pc());
10952
10953     return start;
10954   }
10955 
10956   // Support for spin waits.
        // Emits a stub consisting of the macro assembler's spin_wait() sequence
        // followed by a return, and returns the stub's entry address.
10957   address generate_spin_wait() {
10958     StubId stub_id = StubId::stubgen_spin_wait_id;
10959     int entry_count = StubInfo::entry_count(stub_id);
10960     assert(entry_count == 1, "sanity check");
10961     address start = load_archive_data(stub_id);
10962     if (start != nullptr) {
            // load_archive_data() supplied an entry address: use it and emit nothing.
10963       return start;
10964     }
10965     __ align(CodeEntryAlignment);
10966     StubCodeMark mark(this, stub_id);
10967     start = __ pc();
10968
10969     __ spin_wait();
10970     __ ret(lr);
10971
10972     // record the stub entry and end
10973     store_archive_data(stub_id, start, __ pc());
10974
10975     return start;
10976   }
10977 
10978   void generate_lookup_secondary_supers_table_stub() {
          // Emits one lookup stub per secondary-supers table slot and records
          // each entry address in
          // StubRoutines::_lookup_secondary_supers_table_stubs[slot].
          // Slot 0 is the stub's primary entry (`start`); the entries for slots
          // 1..SIZE-1 are carried in `entries`, both when restoring from
          // load_archive_data() and when handing the result to
          // store_archive_data().
10979     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10980     GrowableArray<address> entries;
10981     int entry_count = StubInfo::entry_count(stub_id);
10982     assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
10983     address start = load_archive_data(stub_id, &entries);
10984     if (start != nullptr) {
            // Restored entries: `start` is slot 0 and entries.at(i) is slot i + 1.
10985       assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
10986              "unexpected extra entry count %d", entries.length());
10987       StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
10988       for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10989         StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
10990       }
10991       return;
10992     }
10993
10994     StubCodeMark mark(this, stub_id);
10995
          // Register assignment passed to lookup_secondary_supers_table_const().
          // NOTE(review): r_bitmap is declared but not referenced in this
          // function.
10996     const Register
10997       r_super_klass  = r0,
10998       r_array_base   = r1,
10999       r_array_length = r2,
11000       r_array_index  = r3,
11001       r_sub_klass    = r4,
11002       r_bitmap       = rscratch2,
11003       result         = r5;
11004     const FloatRegister
11005       vtemp          = v0;
11006
11007     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
11008       address next_entry = __ pc();
11009       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
11010       if (slot == 0) {
11011         start = next_entry;
11012       } else {
11013         entries.append(next_entry);
11014       }
            // NOTE(review): L_success is declared but never bound or used as a
            // branch target in this loop body.
11015       Label L_success;
11016       __ enter();
11017       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
11018                                              r_array_base, r_array_length, r_array_index,
11019                                              vtemp, result, slot,
11020                                              /*stub_is_near*/true);
11021       __ leave();
11022       __ ret(lr);
11023     }
11024     // record the stub entry and end plus all the auxiliary entries
11025     store_archive_data(stub_id, start, __ pc(), &entries);
11026   }
11027 
11028   // Slow path implementation for UseSecondarySupersTable.
        // Emits a stub that wraps
        // MacroAssembler::lookup_secondary_supers_table_slow_path() followed by
        // a return, and returns the stub's entry address.
11029   address generate_lookup_secondary_supers_table_slow_path_stub() {
11030     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
11031     int entry_count = StubInfo::entry_count(stub_id);
11032     assert(entry_count == 1, "sanity check");
11033     address start = load_archive_data(stub_id);
11034     if (start != nullptr) {
            // load_archive_data() supplied an entry address: use it and emit nothing.
11035       return start;
11036     }
11037     StubCodeMark mark(this, stub_id);
11038     start = __ pc();
          // Fixed register assignment for the generated code; the trailing
          // comments mark which registers carry arguments and which is scratch.
11039     const Register
11040       r_super_klass  = r0,        // argument
11041       r_array_base   = r1,        // argument
11042       temp1          = r2,        // temp
11043       r_array_index  = r3,        // argument
11044       r_bitmap       = rscratch2, // argument
11045       result         = r5;        // argument
11046
11047     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
11048     __ ret(lr);
11049
11050     // record the stub entry and end
11051     store_archive_data(stub_id, start, __ pc());
11052
11053     return start;
11054   }
11055 
11056 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11057 
11058   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
11059   //
11060   // If LSE is in use, generate LSE versions of all the stubs. The
11061   // non-LSE versions are in atomic_aarch64.S.
11062 
11063   // class AtomicStubMark records the entry point of a stub and the
11064   // stub pointer which will point to it. The stub pointer is set to
11065   // the entry point when ~AtomicStubMark() is called, which must be
11066   // after ICache::invalidate_range. This ensures safe publication of
11067   // the generated code.
11068   class AtomicStubMark {
11069     address _entry_point;              // address of the stub's first instruction
11070     aarch64_atomic_stub_t *_stub;      // function pointer to update on destruction
11071     MacroAssembler *_masm;             // assembler the stub is emitted through
11072   public:
          // Aligns the code position to 32 bytes and records it as the entry
          // point for the stub about to be emitted.
          // NOTE(review): `__` presumably expands to `_masm->` here, which is
          // why _masm is assigned before the first `__` use -- confirm against
          // the macro definition.
11073     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
11074       _masm = masm;
11075       __ align(32);
11076       _entry_point = __ pc();
11077       _stub = stub;
11078     }
          // Publishes the recorded entry point through the stub pointer.
11079     ~AtomicStubMark() {
11080       *_stub = (aarch64_atomic_stub_t)_entry_point;
11081     }
11082   };
11083 
11084   // NB: For memory_order_conservative we need a trailing membar after
11085   // LSE atomic operations but not a leading membar.
11086   //
11087   // We don't need a leading membar because a clause in the Arm ARM
11088   // says:
11089   //
11090   //   Barrier-ordered-before
11091   //
11092   //   Barrier instructions order prior Memory effects before subsequent
11093   //   Memory effects generated by the same Observer. A read or a write
11094   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
11095   //   Observer if and only if RW1 appears in program order before RW 2
11096   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
11097   //   instruction with both Acquire and Release semantics.
11098   //
11099   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
11100   // and Release semantics, therefore we don't need a leading
11101   // barrier. However, there is no corresponding Barrier-ordered-after
11102   // relationship, therefore we need a trailing membar to prevent a
11103   // later store or load from being reordered with the store in an
11104   // atomic instruction.
11105   //
11106   // This was checked by using the herd7 consistency model simulator
11107   // (http://diy.inria.fr/) with this test case:
11108   //
11109   // AArch64 LseCas
11110   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
11111   // P0 | P1;
11112   // LDR W4, [X2] | MOV W3, #0;
11113   // DMB LD       | MOV W4, #1;
11114   // LDR W3, [X1] | CASAL W3, W4, [X1];
11115   //              | DMB ISH;
11116   //              | STR W4, [X2];
11117   // exists
11118   // (0:X3=0 /\ 0:X4=1)
11119   //
11120   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
11121   // with the store to x in P1. Without the DMB in P1 this may happen.
11122   //
11123   // At the time of writing we don't know of any AArch64 hardware that
11124   // reorders stores in this way, but the Reference Manual permits it.
11125 
11126   void gen_cas_entry(Assembler::operand_size size,
11127                      atomic_memory_order order) {
          // Emits an LSE compare-and-swap routine.
          //
          // size  - operand width of the CAS
          // order - memory_order_relaxed: neither acquire nor release;
          //         memory_order_release: release only;
          //         any other value: acquire and release. Only
          //         memory_order_conservative additionally gets a trailing
          //         barrier.
          //
          // Generated code: c_rarg0 = ptr, c_rarg1 = compare_val,
          // c_rarg2 = exchange_val; the value observed by the CAS is returned
          // in r0. r3 is used as a temporary.
11128     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
11129       exchange_val = c_rarg2;
11130     bool acquire, release;
11131     switch (order) {
11132       case memory_order_relaxed:
11133         acquire = false;
11134         release = false;
11135         break;
11136       case memory_order_release:
11137         acquire = false;
11138         release = true;
11139         break;
11140       default:
11141         acquire = true;
11142         release = true;
11143         break;
11144     }
          // lse_cas() takes the expected value in `prev` and leaves the loaded
          // value there, so work on a copy of compare_val.
11145     __ mov(prev, compare_val);
11146     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
11147     if (order == memory_order_conservative) {
            // Trailing barrier required for conservative ordering.
11148       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11149     }
          // Return the previous value with the width matching `size`
          // (64-bit move for xword, 32-bit move otherwise).
11150     if (size == Assembler::xword) {
11151       __ mov(r0, prev);
11152     } else {
11153       __ movw(r0, prev);
11154     }
11155     __ ret(lr);
11156   }
11157 
11158   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
          // Emits an LSE fetch-and-add routine.
          //
          // size  - operand width of the atomic add
          // order - memory_order_relaxed selects a plain ldadd; every other
          //         value is treated as conservative (ldaddal plus a trailing
          //         barrier)
          //
          // Generated code: c_rarg0 = addr, c_rarg1 = incr; the value loaded by
          // the atomic instruction is returned in r0. r2 is used as a temporary.
11159     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11160     // If not relaxed, then default to conservative.  Relaxed is the only
11161     // case we use enough to be worth specializing.
11162     if (order == memory_order_relaxed) {
11163       __ ldadd(size, incr, prev, addr);
11164     } else {
11165       __ ldaddal(size, incr, prev, addr);
11166       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11167     }
          // Return the loaded value with the width matching `size`.
11168     if (size == Assembler::xword) {
11169       __ mov(r0, prev);
11170     } else {
11171       __ movw(r0, prev);
11172     }
11173     __ ret(lr);
11174   }
11175 
11176   void gen_swpal_entry(Assembler::operand_size size) {
          // Emits an LSE atomic exchange routine: swpal followed by a trailing
          // barrier, returning the value loaded by the swap in r0.
          //
          // size - operand width of the exchange
          //
          // Generated code: c_rarg0 = addr, c_rarg1 = value to store. r2 is used
          // as a temporary.
          // NOTE(review): the local is named `incr`, but here it is the value
          // swapped in, not an increment.
11177     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11178     __ swpal(size, incr, prev, addr);
11179     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
          // Return the loaded value with the width matching `size`.
11180     if (size == Assembler::xword) {
11181       __ mov(r0, prev);
11182     } else {
11183       __ movw(r0, prev);
11184     }
11185     __ ret(lr);
11186   }
11187 
11188   void generate_atomic_entry_points() {
          // Emits the LSE versions of the atomic stubs and points the
          // aarch64_atomic_*_impl function pointers at them. Does nothing
          // unless UseLSE is set.
          //
          // The first stub (fetch_add_4) is the primary entry `start`; all
          // other entries travel through `entries`. The order in which entries
          // are read back from load_archive_data() below must match the order
          // in which they are appended before store_archive_data().
11189     if (! UseLSE) {
11190       return;
11191     }
11192     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
11193     GrowableArray<address> entries;
11194     int entry_count = StubInfo::entry_count(stub_id);
11195     address start = load_archive_data(stub_id, &entries);
11196     if (start != nullptr) {
            // Entries were supplied by load_archive_data(): install them and
            // emit nothing.
11197       assert(entries.length() == entry_count - 1,
11198              "unexpected extra entry count %d", entries.length());
11199       aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
11200       int idx = 0;
11201       aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11202       aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11203       aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11204       aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11205       aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11206       aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11207       aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11208       aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11209       aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11210       aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11211       aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11212       aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11213       aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11214       aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11215       aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11216       assert(idx == entries.length(), "sanity!");
11217       return;
11218     }
11219
11220     __ align(CodeEntryAlignment);
11221     StubCodeMark mark(this, stub_id);
11222     start = __ pc();
11223     address end;
          // The AtomicStubMark objects live in their own scope so that their
          // destructors, which publish the entry points, run only after the
          // instruction cache has been invalidated for the generated range.
11224     {
11225     // ADD, memory_order_conservative
11226     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
11227     gen_ldadd_entry(Assembler::word, memory_order_conservative);
11228
11229     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
11230     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
11231
11232     // ADD, memory_order_relaxed
11233     AtomicStubMark mark_fetch_add_4_relaxed
11234       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
11235     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
11236
11237     AtomicStubMark mark_fetch_add_8_relaxed
11238       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
11239     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
11240
11241     // XCHG, memory_order_conservative
11242     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
11243     gen_swpal_entry(Assembler::word);
11244
11245     AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
11246     gen_swpal_entry(Assembler::xword);
11247
11248     // CAS, memory_order_conservative
11249     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
11250     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
11251
11252     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
11253     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
11254
11255     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
11256     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
11257
11258     // CAS, memory_order_relaxed
11259     AtomicStubMark mark_cmpxchg_1_relaxed
11260       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
11261     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
11262
11263     AtomicStubMark mark_cmpxchg_4_relaxed
11264       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
11265     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
11266
11267     AtomicStubMark mark_cmpxchg_8_relaxed
11268       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
11269     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
11270
          // CAS, memory_order_release
11271     AtomicStubMark mark_cmpxchg_4_release
11272       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
11273     gen_cas_entry(MacroAssembler::word, memory_order_release);
11274
11275     AtomicStubMark mark_cmpxchg_8_release
11276       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
11277     gen_cas_entry(MacroAssembler::xword, memory_order_release);
11278
          // CAS, memory_order_seq_cst
11279     AtomicStubMark mark_cmpxchg_4_seq_cst
11280       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
11281     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
11282
11283     AtomicStubMark mark_cmpxchg_8_seq_cst
11284       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
11285     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
11286
11287     end = __ pc();
11288
11289     ICache::invalidate_range(start, end - start);
11290     // exit block to force update of AtomicStubMark targets
11291     }
11292
11293     assert(start == (address)aarch64_atomic_fetch_add_4_impl,
11294            "atomic stub should be at start of buffer");
11295     // record the stub start and end plus all the entries saved by the
11296     // AtomicStubMark destructor
          // (same order as the archive-load branch at the top of this function)
11297     entries.append((address)aarch64_atomic_fetch_add_8_impl);
11298     entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
11299     entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
11300     entries.append((address)aarch64_atomic_xchg_4_impl);
11301     entries.append((address)aarch64_atomic_xchg_8_impl);
11302     entries.append((address)aarch64_atomic_cmpxchg_1_impl);
11303     entries.append((address)aarch64_atomic_cmpxchg_4_impl);
11304     entries.append((address)aarch64_atomic_cmpxchg_8_impl);
11305     entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
11306     entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
11307     entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
11308     entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
11309     entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
11310     entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
11311     entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);
11312
11313     assert(entries.length() == entry_count - 1,
11314            "unexpected extra entry count %d", entries.length());
11315
11316     store_archive_data(stub_id, start, end, &entries);
11317   }
11318 #endif // LINUX
11319 
11320   static void save_return_registers(MacroAssembler* masm) {
11321     if (InlineTypeReturnedAsFields) {
11322       masm->push(RegSet::range(r0, r7), sp);
11323       masm->sub(sp, sp, 4 * wordSize);
11324       masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
11325       masm->sub(sp, sp, 4 * wordSize);
11326       masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
11327     } else {
11328       masm->fmovd(rscratch1, v0);
11329       masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
11330     }
11331   }
11332 
11333   static void restore_return_registers(MacroAssembler* masm) {
11334     if (InlineTypeReturnedAsFields) {
11335       masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
11336       masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
11337       masm->pop(RegSet::range(r0, r7), sp);
11338     } else {
11339       masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
11340       masm->fmovd(v0, rscratch1);
11341     }
11342   }
11343 
  // Emit the body of a continuation thaw stub for the given thaw kind and
  // return the address of its first instruction.
  //
  // Outline of the emitted code:
  //   1. (return-barrier kinds only) reset sp to the value loaded from
  //      JavaThread::cont_entry_offset().
  //   2. Call Continuation::prepare_thaw to obtain the size of the frames
  //      to thaw; a zero result branches to
  //      SharedRuntime::throw_StackOverflowError_entry().
  //   3. Lower sp by that size (rounded down to a 16-byte boundary) and
  //      call Continuation::thaw_entry(), whose result is the sp of the
  //      yielding frame.
  //   4. Switch sp/rfp to that frame and either return into it or, for
  //      the return-barrier-exception kind, jump to the exception handler
  //      looked up for its return address.
  //
  // For return-barrier kinds the return-value registers are saved and
  // restored around both runtime calls.  This function does not create a
  // StubCodeMark itself; the visible callers do that before calling it.
  address generate_cont_thaw(Continuation::thaw_kind kind) {
    bool return_barrier = Continuation::is_thaw_return_barrier(kind);
    bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

    address start = __ pc();

    if (return_barrier) {
      // sp = value stored at JavaThread::cont_entry_offset()
      __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
      __ mov(sp, rscratch1);
    }
    // sp must equal the value stored at cont_entry_offset at this point
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");

    if (return_barrier) {
      // preserve possible return value from a method returning to the return barrier
      save_return_registers(_masm);
    }

    // prepare_thaw(thread, return_barrier ? 1 : 0)
    __ movw(c_rarg1, (return_barrier ? 1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
    __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      restore_return_registers(_masm);
    }
    // the save/restore pair above must have left sp where it was
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");


    Label thaw_success;
    // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ cbnz(rscratch2, thaw_success);
    // size == 0: tail-jump to the StackOverflowError entry
    __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
    __ br(rscratch1);
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(rscratch1, sp, rscratch2);
    __ andr(rscratch1, rscratch1, -16); // align
    __ mov(sp, rscratch1);

    if (return_barrier) {
      // save original return value -- again
      save_return_registers(_masm);
    }

    // If we want, we can templatize thaw by kind, and have three different entries
    __ movw(c_rarg1, (uint32_t)kind);

    __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
    __ mov(rscratch2, r0); // r0 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      restore_return_registers(_masm);
    } else {
      __ mov(r0, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
    __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
    __ mov(rfp, sp);

    if (return_barrier_exception) {
      __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
      __ authenticate_return_address(c_rarg1);
      __ verify_oop(r0);
      // save return value containing the exception oop in callee-saved R19
      __ mov(r19, r0);

      // r0 = exception_handler_for_return_address(thread, return address)
      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);

      // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
      // __ reinitialize_ptrue();

      // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc

      __ mov(r1, r0); // the exception handler
      __ mov(r0, r19); // restore return value containing the exception oop
      __ verify_oop(r0);

      __ leave();
      __ mov(r3, lr); // exception pc = the return address reloaded by leave()
      __ br(r1); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret(lr);
    }

    return start;
  }
11435 
11436   address generate_cont_thaw() {
11437     if (!Continuations::enabled()) return nullptr;
11438 
11439     StubId stub_id = StubId::stubgen_cont_thaw_id;
11440     int entry_count = StubInfo::entry_count(stub_id);
11441     assert(entry_count == 1, "sanity check");
11442     address start = load_archive_data(stub_id);
11443     if (start != nullptr) {
11444       return start;
11445     }
11446     StubCodeMark mark(this, stub_id);
11447     start = __ pc();
11448     generate_cont_thaw(Continuation::thaw_top);
11449 
11450     // record the stub start and end
11451     store_archive_data(stub_id, start, __ pc());
11452 
11453     return start;
11454   }
11455 
11456   address generate_cont_returnBarrier() {
11457     if (!Continuations::enabled()) return nullptr;
11458 
11459     // TODO: will probably need multiple return barriers depending on return type
11460     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
11461     int entry_count = StubInfo::entry_count(stub_id);
11462     assert(entry_count == 1, "sanity check");
11463     address start = load_archive_data(stub_id);
11464     if (start != nullptr) {
11465       return start;
11466     }
11467     StubCodeMark mark(this, stub_id);
11468     start = __ pc();
11469 
11470     generate_cont_thaw(Continuation::thaw_return_barrier);
11471 
11472     // record the stub start and end
11473     store_archive_data(stub_id, start, __ pc());
11474 
11475     return start;
11476   }
11477 
11478   address generate_cont_returnBarrier_exception() {
11479     if (!Continuations::enabled()) return nullptr;
11480 
11481     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
11482     int entry_count = StubInfo::entry_count(stub_id);
11483     assert(entry_count == 1, "sanity check");
11484     address start = load_archive_data(stub_id);
11485     if (start != nullptr) {
11486       return start;
11487     }
11488     StubCodeMark mark(this, stub_id);
11489     start = __ pc();
11490 
11491     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
11492 
11493     // record the stub start and end
11494     store_archive_data(stub_id, start, __ pc());
11495 
11496     return start;
11497   }
11498 
  // Generate (or fetch from the archive) the cont_preempt stub.
  //
  // The emitted code resets the last Java frame, moves sp to the value
  // stored at JavaThread::cont_entry_offset(), and then tests the
  // thread's preemption_cancelled byte:
  //   - zero:     run the continuation-enter cleanup, tear down the frame
  //               and return through lr;
  //   - non-zero: clear the byte, point rfp just past the
  //               ContinuationEntry and jump to the address read from
  //               ContinuationEntry::thaw_call_pc_address().
  //
  // Returns nullptr when continuations are disabled, otherwise the stub's
  // start address.
  address generate_cont_preempt_stub() {
    if (!Continuations::enabled()) return nullptr;
    StubId stub_id = StubId::stubgen_cont_preempt_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // a non-null archived copy is returned without emitting any code
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    __ reset_last_Java_frame(true);

    // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
    __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
    __ mov(sp, rscratch2);

    Label preemption_cancelled;
    __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
    __ cbnz(rscratch1, preemption_cancelled);

    // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
    SharedRuntime::continuation_enter_cleanup(_masm);
    __ leave();
    __ ret(lr);

    // We acquired the monitor after freezing the frames so call thaw to continue execution.
    __ bind(preemption_cancelled);
    // clear the flag before resuming
    __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
    // rfp = sp + ContinuationEntry::size()
    __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
    // the jump target is loaded indirectly, from the location returned by
    // thaw_call_pc_address()
    __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
    __ ldr(rscratch1, Address(rscratch1));
    __ br(rscratch1);

    // record the stub start and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
11539 
  // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
  // are represented as long[5], with BITS_PER_LIMB = 26.
  // Pack five 26-bit limbs into three 64-bit registers.
  //
  // With limb[k] denoting the jlong at src + k * sizeof(jlong):
  //   dest0 = limb[0] + (limb[1] << 26) + (limb[2] << 52)
  //   dest1 = (limb[2] >> 12) + (limb[3] << 14) + (limb[4] << 40)
  //   dest2 = limb[4] >> 24
  //
  // The pieces are combined with add, not orr, so the result is only a
  // clean bit-packing if every limb fits in 26 bits -- presumably
  // guaranteed by the caller; TODO confirm.
  //
  // dest2 may be noreg.  In that case nothing is written for the top
  // bits; ASSERT builds instead emit a check that stops if
  // limb[4] >> 24 is non-zero.
  //
  // Clobbers rscratch1 and rscratch2.
  void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
    __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
    __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
    __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
    __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits

    __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
    __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
    __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
    __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits

    if (dest2->is_valid()) {
      __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
    } else {
#ifdef ASSERT
      // No register for the top bits: they are required to be zero.
      Label OK;
      __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
      __ br(__ EQ, OK);
      __ stop("high bits of Poly1305 integer should be zero");
      __ should_not_reach_here();
      __ bind(OK);
#endif
    }
  }
11567 
11568   // As above, but return only a 128-bit integer, packed into two
11569   // 64-bit registers.
11570   void pack_26(Register dest0, Register dest1, Register src) {
11571     pack_26(dest0, dest1, noreg, src);
11572   }
11573 
  // Multiply and multiply-accumulate unsigned 64-bit registers.

  // prod_hi:prod_lo = n * m (full 128-bit unsigned product).
  // The low half is written before the high half is computed, so
  // prod_lo must not be the same register as n or m.
  void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
    __ mul(prod_lo, n, m);
    __ umulh(prod_hi, n, m);
  }
  // sum_hi:sum_lo += n * m, where n * m is the full 128-bit unsigned
  // product.  The product is formed in rscratch2:rscratch1, which are
  // clobbered.  The final add uses adc, so a carry out of sum_hi is not
  // recorded.
  void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
    wide_mul(rscratch1, rscratch2, n, m);
    __ adds(sum_lo, sum_lo, rscratch1);
    __ adc(sum_hi, sum_hi, rscratch2);
  }
11584 
11585   // Poly1305, RFC 7539
11586 
11587   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
11588   // description of the tricks used to simplify and accelerate this
11589   // computation.
11590 
  // Generate (or fetch from the archive) the poly1305_processBlocks stub
  // and return its start address.
  //
  // The emitted code takes four values, assigned in order from the
  // register iterator below: input_start (pointer to the data), length
  // (byte count), acc_start (accumulator, five 26-bit limbs as long[5])
  // and r_start (key, same layout).
  //
  // While length >= BLOCK_LENGTH (16), it consumes one 16-byte block per
  // iteration: U = (U + block + 2^128) * R, partially reduced modulo
  // 2^130 - 5.  It then reduces once more and writes U back to acc_start
  // as five 26-bit limbs.  If length < BLOCK_LENGTH on entry the stub
  // branches straight to DONE, so nothing is written back.
  address generate_poly1305_processBlocks() {
    StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // a non-null archived copy is returned without emitting any code
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();
    // NOTE(review): `here` is never referenced in this function.
    Label here;
    __ enter();
    RegSet callee_saved = RegSet::range(r19, r28);
    __ push(callee_saved, sp);

    // Registers are handed out from c_rarg0..r28, excluding r18_tls and
    // the two scratch registers (which pack_26 and wide_madd clobber).
    RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();

    // Arguments
    const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;

    // R_n is the 128-bit randomly-generated key, packed into two
    // registers.  The caller passes this key to us as long[5], with
    // BITS_PER_LIMB = 26.
    const Register R_0 = *++regs, R_1 = *++regs;
    pack_26(R_0, R_1, r_start);

    // RR_n is (R_n >> 2) * 5
    const Register RR_0 = *++regs, RR_1 = *++regs;
    __ lsr(RR_0, R_0, 2);
    __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
    __ lsr(RR_1, R_1, 2);
    __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);

    // U_n is the current checksum
    const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
    pack_26(U_0, U_1, U_2, acc_start);

    static constexpr int BLOCK_LENGTH = 16;
    Label DONE, LOOP;

    // Signed compare: skip everything when less than one full block remains.
    __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
    __ br(Assembler::LT, DONE); {
      __ bind(LOOP);

      // S_n is to be the sum of U_n and the next block of data
      const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
      __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
      __ adds(S_0, U_0, S_0);
      __ adcs(S_1, U_1, S_1);
      __ adc(S_2, U_2, zr);
      // Add 1 to the top word, i.e. 2^128 to the three-word sum.
      __ add(S_2, S_2, 1);

      const Register U_0HI = *++regs, U_1HI = *++regs;

      // NB: this logic depends on some of the special properties of
      // Poly1305 keys. In particular, because we know that the top
      // four bits of R_0 and R_1 are zero, we can add together
      // partial products without any risk of needing to propagate a
      // carry out.
      wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
      wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
      // U_2 = S_2 * (R_0 & 3)
      __ andr(U_2, R_0, 3);
      __ mul(U_2, S_2, U_2);

      // Recycle registers S_0, S_1, S_2
      regs = (regs.remaining() + S_0 + S_1 + S_2).begin();

      // Partial reduction mod 2**130 - 5
      __ adds(U_1, U_0HI, U_1);
      __ adc(U_2, U_1HI, U_2);
      // Sum now in U_2:U_1:U_0.
      // Dead: U_0HI, U_1HI.
      regs = (regs.remaining() + U_0HI + U_1HI).begin();

      // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps

      // First, U_2:U_1:U_0 += (U_2 >> 2)
      __ lsr(rscratch1, U_2, 2);
      __ andr(U_2, U_2, (u8)3);
      __ adds(U_0, U_0, rscratch1);
      __ adcs(U_1, U_1, zr);
      __ adc(U_2, U_2, zr);
      // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
      // (rscratch1 still holds the pre-masking U_2 >> 2 from above)
      __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
      __ adcs(U_1, U_1, zr);
      __ adc(U_2, U_2, zr);

      // Loop again while at least one more full block remains
      // (~LT is presumably the inverted condition, GE).
      __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
      __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
      __ br(~ Assembler::LT, LOOP);
    }

    // Further reduce modulo 2^130 - 5
    __ lsr(rscratch1, U_2, 2);
    __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = (U_2 >> 2) * 5
    __ adds(U_0, U_0, rscratch1); // U_0 += (U_2 >> 2) * 5
    __ adcs(U_1, U_1, zr);
    __ andr(U_2, U_2, (u1)3);
    __ adc(U_2, U_2, zr);

    // Unpack the sum into five 26-bit limbs and write to memory.
    // limb 0 = U_0[25:0], limb 1 = U_0[51:26]
    __ ubfiz(rscratch1, U_0, 0, 26);
    __ ubfx(rscratch2, U_0, 26, 26);
    __ stp(rscratch1, rscratch2, Address(acc_start));
    // limb 2 = U_1[13:0]:U_0[63:52], limb 3 = U_1[39:14]
    __ ubfx(rscratch1, U_0, 52, 12);
    __ bfi(rscratch1, U_1, 12, 14);
    __ ubfx(rscratch2, U_1, 14, 26);
    __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
    // limb 4 = U_2[2:0]:U_1[63:40]
    __ ubfx(rscratch1, U_1, 40, 24);
    __ bfi(rscratch1, U_2, 24, 3);
    __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));

    __ bind(DONE);
    __ pop(callee_saved, sp);
    __ leave();
    __ ret(lr);

    // record the stub start and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
11714 
11715   // exception handler for upcall stubs
11716   address generate_upcall_stub_exception_handler() {
11717     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
11718     int entry_count = StubInfo::entry_count(stub_id);
11719     assert(entry_count == 1, "sanity check");
11720     address start = load_archive_data(stub_id);
11721     if (start != nullptr) {
11722       return start;
11723     }
11724     StubCodeMark mark(this, stub_id);
11725     start = __ pc();
11726 
11727     // Native caller has no idea how to handle exceptions,
11728     // so we just crash here. Up to callee to catch exceptions.
11729     __ verify_oop(r0);
11730     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
11731     __ blr(rscratch1);
11732     __ should_not_reach_here();
11733 
11734     // record the stub start and end
11735     store_archive_data(stub_id, start, __ pc());
11736 
11737     return start;
11738   }
11739 
  // load Method* target of MethodHandle
  // j_rarg0 = jobject receiver
  // rmethod = result
  //
  // Generates (or fetches from the archive) a stub that resolves the
  // global jobject in j_rarg0 and then follows
  //   MethodHandle.form -> LambdaForm.vmentry -> MemberName.method
  //     -> ResolvedMethodName.vmtarget
  // leaving the final value in rmethod.  The result is also stored at
  // JavaThread::callee_target_offset() before returning.  rscratch1 and
  // rscratch2 are passed as temporaries to the load helpers.
  // Returns the stub's start address.
  address generate_upcall_stub_load_target() {
    StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // a non-null archived copy is returned without emitting any code
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
    __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
    __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
    __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
    // The last hop is loaded as T_ADDRESS, not as a heap oop.
    __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
                      Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
                      noreg, noreg);
    __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized

    __ ret(lr);

    // record the stub start and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
11771 
11772 #undef __
11773 #define __ masm->
11774 
11775   class MontgomeryMultiplyGenerator : public MacroAssembler {
11776 
    // Registers used by the generated code; all are assigned in the
    // constructor.
    //   Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen : argument registers
    //   Ra, Rb, Rm, Rn   : current digit of a, b, m and n
    //   Pa, Pb, Pn, Pm   : pointers to the current/next digit
    //   Rhi_ab, Rlo_ab   : high and low halves of the a*b product
    //   Rhi_mn, Rlo_mn   : high and low halves of the m*n product
    //   t0, t1, t2       : triple-precision accumulator
    //   Ri, Rj           : loop indexes
    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    // Registers pushed by save_regs() and popped by restore_regs().
    RegSet _toSave;
    // True when generating the squaring variant (Pb_base aliases Pa_base).
    bool _squaring;
11782 
11783   public:
    // Construct a generator that emits into the code buffer of `as`.
    //
    // All working registers are taken, one after another, from
    // r0..r26 with r18_tls excluded.  When `squaring` is true, Pb_base
    // is made an alias of Pa_base and consumes no register of its own,
    // so every later register is allocated one position earlier than in
    // the non-squaring case.
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
      Pa_base = *regs;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = *++regs;
      Pn_base = *++regs;
      Rlen= *++regs;
      inv = *++regs;
      Pm_base = *++regs;

                          // Working registers:
      Ra =  *++regs;        // The current digit of a, b, n, and m.
      Rb =  *++regs;
      Rm =  *++regs;
      Rn =  *++regs;

      Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
      Pb =  *++regs;
      Pm =  *++regs;
      Pn =  *++regs;

      t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
      t2 =  *++regs;

      Ri =  *++regs;        // Inner and outer loop indexes.
      Rj =  *++regs;

      Rhi_ab = *++regs;     // Product registers: low and high parts
      Rlo_ab = *++regs;     // of a*b and m*n.
      Rhi_mn = *++regs;
      Rlo_mn = *++regs;

      // r19 and up are callee-saved.
      // *regs is the last register handed out (Rlo_mn); Pm_base is added
      // to the save set as well.
      _toSave = RegSet::range(r19, *regs) + Pm_base;
    }
11826 
11827   private:
11828     void save_regs() {
11829       push(_toSave, sp);
11830     }
11831 
11832     void restore_regs() {
11833       pop(_toSave, sp);
11834     }
11835 
11836     template <typename T>
11837     void unroll_2(Register count, T block) {
11838       Label loop, end, odd;
11839       tbnz(count, 0, odd);
11840       cbz(count, end);
11841       align(16);
11842       bind(loop);
11843       (this->*block)();
11844       bind(odd);
11845       (this->*block)();
11846       subs(count, count, 2);
11847       br(Assembler::GT, loop);
11848       bind(end);
11849     }
11850 
11851     template <typename T>
11852     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11853       Label loop, end, odd;
11854       tbnz(count, 0, odd);
11855       cbz(count, end);
11856       align(16);
11857       bind(loop);
11858       (this->*block)(d, s, tmp);
11859       bind(odd);
11860       (this->*block)(d, s, tmp);
11861       subs(count, count, 2);
11862       br(Assembler::GT, loop);
11863       bind(end);
11864     }
11865 
    // Set-up emitted before the first run of step()s for index i.
    //
    // Loads the first digits (Ra = a[0], Rb = b[i], Rm = m[0], Rn = n[i]),
    // points Pa/Pb/Pm/Pn at those same elements, and clears the pending
    // m*n product.  i is a word index: it is zero-extended from 32 bits
    // and scaled by the word size.
    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      // (the loads are emitted ahead of the pointer set-up)
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }
11889 
    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    //
    // Each step accumulates the m*n product left pending by the
    // previous step together with this step's a*b product, and leaves
    // a new m*n product pending in Rhi_mn:Rlo_mn for whoever runs next.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      // a*b from the top of this step; the m*n just computed stays pending
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }
11916 
    // Tail emitted after the step()s of one first-phase column.
    //
    // Accumulates the last a*b product and the pending m*n product,
    // computes the new digit Rm = t0 * inv and stores it at *Pm, then
    // adds Rm * Rn and shifts the accumulator down by one digit
    // (t0 = t1, t1 = t2, t2 = 0).  No loads are emitted here.
    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }
11961 
    // Set-up emitted before the run of step()s for a second-phase
    // index i.
    //
    // Computes Rj = i - len, points Pa/Pm at element (i - len) and
    // Pb/Pn at element len of their arrays, then pre-increments Pa/Pm
    // and pre-decrements Pb/Pn by one word while loading the first
    // digits.  The pending m*n product is cleared.  Clobbers Rj.
    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        // constant i: materialize it first, then subtract len
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }
11994 
11995     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11996       block_comment("post2");
11997       if (i.is_constant()) {
11998         mov(Rj, i.as_constant()-len.as_constant());
11999       } else {
12000         sub(Rj, i.as_register(), len);
12001       }
12002 
12003       adds(t0, t0, Rlo_mn); // The pending m*n, low part
12004 
12005       // As soon as we know the least significant digit of our result,
12006       // store it.
12007       // Pm_base[i-len] = t0;
12008       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
12009 
12010       // t0 = t1; t1 = t2; t2 = 0;
12011       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
12012       adc(t1, t2, zr);
12013       mov(t2, zr);
12014     }
12015 
    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.
    //
    // len is the number of words in each array.  t1 and t2 are reused
    // as the loop counter and index, and Rm/Rn as digit temporaries, so
    // all four are destroyed.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            // m[i] -= n[i] with borrow; only sbcs touches the flags
            // inside this loop, so the borrow chains across iterations.
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            // Digits for the next iteration are loaded before the
            // counter is tested, so the final iteration loads index
            // len; those two values are never used.
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          // propagate the final borrow into t0
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }
12045 
    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    //
    // len counts 64-bit words.  The copy walks s downwards from
    // s + len words while d walks upwards; reverse1 also swaps the two
    // 32-bit halves of every word.  tmp1 and tmp2 must have encodings
    // below r19 (asserted).
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1->encoding() < r19->encoding(), "register corruption");
      assert(tmp2->encoding() < r19->encoding(), "register corruption");

      // s = one past the last source word
      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      // unroll_2 destroys its counter, so count in a copy of len
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      // s = d - len words, i.e. the original value of d
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    // reverse1 moves a single word: it pre-decrements s by one word and
    // loads from there, rotates the value by 32 bits (swapping its two
    // 32-bit halves), stores it at d and post-increments d by one word.
    // Destroys tmp.
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }
12066 
    // step() for the squaring variant: a normal step() followed by a
    // second accumulation of the same a*b product, so that product is
    // counted twice.
    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }
12072 
12073     void last_squaring(RegisterOrConstant i) {
12074       Label dont;
12075       // if ((i & 1) == 0) {
12076       tbnz(i.as_register(), 0, dont); {
12077         // MACC(Ra, Rb, t0, t1, t2);
12078         // Ra = *++Pa;
12079         // Rb = *--Pb;
12080         umulh(Rhi_ab, Ra, Rb);
12081         mul(Rlo_ab, Ra, Rb);
12082         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12083       } bind(dont);
12084     }
12085 
    // The m*n half of step() on its own: accumulate the pending m*n
    // product, compute the next one (left pending in Rhi_mn:Rlo_mn) and
    // advance Pm/Pn while loading the next digits.  Nothing is done for
    // a*b.
    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }
12097 
12098     void post1_squaring() {
            // End of one outer iteration of the first squaring loop:
            //   1. add the pending m*n product to t2:t1:t0,
            //   2. compute Rm = t0 * inv and store it at *Pm,
            //   3. add the high half of Rm * Rn plus the carry implied by the
            //      (known) low half,
            //   4. shift the accumulator down one word (t0 = t1; t1 = t2; t2 = 0).
12099       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
12100 
12101       // *Pm = Rm = t0 * inv;
12102       mul(Rm, t0, inv);
12103       str(Rm, Address(Pm));
12104 
12105       // MACC(Rm, Rn, t0, t1, t2);
12106       // t0 = t1; t1 = t2; t2 = 0;
12107       umulh(Rhi_mn, Rm, Rn);
12108 
12109 #ifndef PRODUCT
12110       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
            // Non-PRODUCT builds compute the low half explicitly and stop if
            // adding it to t0 does not give zero.
12111       {
12112         mul(Rlo_mn, Rm, Rn);
12113         add(Rlo_mn, t0, Rlo_mn);
12114         Label ok;
12115         cbz(Rlo_mn, ok); {
12116           stop("broken Montgomery multiply");
12117         } bind(ok);
12118       }
12119 #endif
12120       // We have very carefully set things up so that
12121       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
12122       // the lower half of Rm * Rn because we know the result already:
12123       // it must be -t0.  t0 + (-t0) must generate a carry iff
12124       // t0 != 0.  So, rather than do a mul and an adds we just set
12125       // the carry flag iff t0 is nonzero.
12126       //
12127       // mul(Rlo_mn, Rm, Rn);
12128       // adds(zr, t0, Rlo_mn);
12129       subs(zr, t0, 1); // Set carry iff t0 is nonzero
12130       adcs(t0, t1, Rhi_mn);   // t0 = t1 + Rhi_mn + carry
12131       adc(t1, t2, zr);        // t1 = t2 + carry
12132       mov(t2, zr);            // t2 = 0
12133     }
12134 
12135     void acc(Register Rhi, Register Rlo,
12136              Register t0, Register t1, Register t2) {
            // Triple-precision accumulate: t2:t1:t0 += Rhi:Rlo.
            // The three instructions form one carry chain and must stay in
            // this order with nothing that alters the flags in between.
12137       adds(t0, t0, Rlo);   // low word, sets carry
12138       adcs(t1, t1, Rhi);   // middle word plus carry, sets carry
12139       adc(t2, t2, zr);     // top word absorbs the final carry
12140     }
12141 
12142   public:
12143     /**
12144      * Fast Montgomery multiplication.  The derivation of the
12145      * algorithm is in A Cryptographic Library for the Motorola
12146      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
12147      *
12148      * Arguments:
12149      *
12150      * Inputs for multiplication:
12151      *   c_rarg0   - int array elements a
12152      *   c_rarg1   - int array elements b
12153      *   c_rarg2   - int array elements n (the modulus)
12154      *   c_rarg3   - int length
12155      *   c_rarg4   - int inv
12156      *   c_rarg5   - int array elements m (the result)
12157      *
12158      * Inputs for squaring:
12159      *   c_rarg0   - int array elements a
12160      *   c_rarg1   - int array elements n (the modulus)
12161      *   c_rarg2   - int length
12162      *   c_rarg3   - int inv
12163      *   c_rarg4   - int array elements m (the result)
12164      *
           * Returns the address of the first generated instruction.
           *
           * The generated code returns immediately when the length is zero
           * and stops with an error message when the length exceeds 512 ints.
           * When _squaring is set the copy of b is skipped.
           * NOTE(review): the mapping of the argument registers above onto
           * Pa_base/Pb_base/Pn_base/Rlen/inv/Pm_base is set up outside this
           * view -- confirm there how Pb_base is defined in squaring mode.
           *
12165      */
12166     address generate_multiply() {
12167       Label argh, nothing;
12168 
12169       align(CodeEntryAlignment);
12170       address entry = pc();
12171 
            // Zero length: branch straight to the ret.  This is taken before
            // enter(), and `nothing` is bound after leave(), so no frame is
            // built or torn down on this path.
12172       cbzw(Rlen, nothing);
12173 
12174       enter();
12175 
12176       // Make room.
            // Lengths above 512 ints are rejected (unsigned HI compare).  The
            // scratch area is Rlen << exact_log2(4 * sizeof (jint)) bytes below
            // sp, i.e. four jints per input int; with 4-byte jints that is at
            // most 8192 bytes, the limit quoted in the message at argh.
            // Ra keeps the unrounded start of the area, sp is rounded down to
            // a multiple of 2 * wordSize.
12177       cmpw(Rlen, 512);
12178       br(Assembler::HI, argh);
12179       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12180       andr(sp, Ra, -2 * wordSize);
12181 
12182       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
12183 
12184       {
12185         // Copy input args, reversing as we go.  We use Ra as a
12186         // temporary variable.
              // Each reverse() advances Ra past the words it wrote and leaves
              // the source register (Pa_base, Pb_base, Pn_base) pointing at
              // the start of its stack copy.
12187         reverse(Ra, Pa_base, Rlen, t0, t1);
12188         if (!_squaring)
12189           reverse(Ra, Pb_base, Rlen, t0, t1);
12190         reverse(Ra, Pn_base, Rlen, t0, t1);
12191       }
12192 
12193       // Push all call-saved registers and also Pm_base which we'll need
12194       // at the end.
12195       save_regs();
12196 
12197 #ifndef PRODUCT
12198       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
12199       {
12200         ldr(Rn, Address(Pn_base, 0));
12201         mul(Rlo_mn, Rn, inv);
12202         subs(zr, Rlo_mn, -1);
12203         Label ok;
12204         br(EQ, ok); {
12205           stop("broken inverse in Montgomery multiply");
12206         } bind(ok);
12207       }
12208 #endif
12209 
            // Ra points just past the copied inputs; the result m is built
            // there.  The caller's Pm_base was pushed by save_regs() above.
12210       mov(Pm_base, Ra);
12211 
            // Clear the triple-precision accumulator t2:t1:t0.
12212       mov(t0, zr);
12213       mov(t1, zr);
12214       mov(t2, zr);
12215 
12216       block_comment("for (int i = 0; i < len; i++) {");
12217       mov(Ri, zr); {
12218         Label loop, end;
              // Guard once up front, then run the body as a bottom-tested loop.
12219         cmpw(Ri, Rlen);
12220         br(Assembler::GE, end);
12221 
12222         bind(loop);
12223         pre1(Ri);
12224 
12225         block_comment("  for (j = i; j; j--) {"); {
12226           movw(Rj, Ri);
12227           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12228         } block_comment("  } // j");
12229 
12230         post1();
12231         addw(Ri, Ri, 1);
12232         cmpw(Ri, Rlen);
12233         br(Assembler::LT, loop);
12234         bind(end);
12235         block_comment("} // i");
12236       }
12237 
12238       block_comment("for (int i = len; i < 2*len; i++) {");
12239       mov(Ri, Rlen); {
12240         Label loop, end;
              // Compare Ri with Rlen << 1, i.e. 2*len.
12241         cmpw(Ri, Rlen, Assembler::LSL, 1);
12242         br(Assembler::GE, end);
12243 
12244         bind(loop);
12245         pre2(Ri, Rlen);
12246 
12247         block_comment("  for (j = len*2-i-1; j; j--) {"); {
                // Rj = 2*len - i - 1
12248           lslw(Rj, Rlen, 1);
12249           subw(Rj, Rj, Ri);
12250           subw(Rj, Rj, 1);
12251           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12252         } block_comment("  } // j");
12253 
12254         post2(Ri, Rlen);
12255         addw(Ri, Ri, 1);
12256         cmpw(Ri, Rlen, Assembler::LSL, 1);
12257         br(Assembler::LT, loop);
12258         bind(end);
12259       }
12260       block_comment("} // i");
12261 
            // NOTE(review): normalize() is defined outside this view;
            // presumably it is the trailing `while (t0) t0 = sub(...)` step of
            // the C sketch below -- confirm.
12262       normalize(Rlen);
12263 
12264       mov(Ra, Pm_base);  // Save Pm_base in Ra
12265       restore_regs();  // Restore caller's Pm_base
12266 
12267       // Copy our result into caller's Pm_base
12268       reverse(Pm_base, Ra, Rlen, t0, t1);
12269 
12270       leave();
12271       bind(nothing);
12272       ret(lr);
12273 
12274       // handler for error case
            // Placed after the ret, so it is reached only through the length
            // check branch above.
12275       bind(argh);
12276       stop("MontgomeryMultiply total_allocation must be <= 8192");
12277 
12278       return entry;
12279     }
12280     // In C, approximately:
12281 
12282     // void
12283     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
12284     //                     julong Pn_base[], julong Pm_base[],
12285     //                     julong inv, int len) {
12286     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12287     //   julong *Pa, *Pb, *Pn, *Pm;
12288     //   julong Ra, Rb, Rn, Rm;
12289 
12290     //   int i;
12291 
12292     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12293 
12294     //   for (i = 0; i < len; i++) {
12295     //     int j;
12296 
12297     //     Pa = Pa_base;
12298     //     Pb = Pb_base + i;
12299     //     Pm = Pm_base;
12300     //     Pn = Pn_base + i;
12301 
12302     //     Ra = *Pa;
12303     //     Rb = *Pb;
12304     //     Rm = *Pm;
12305     //     Rn = *Pn;
12306 
12307     //     int iters = i;
12308     //     for (j = 0; iters--; j++) {
12309     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12310     //       MACC(Ra, Rb, t0, t1, t2);
12311     //       Ra = *++Pa;
12312     //       Rb = *--Pb;
12313     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12314     //       MACC(Rm, Rn, t0, t1, t2);
12315     //       Rm = *++Pm;
12316     //       Rn = *--Pn;
12317     //     }
12318 
12319     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
12320     //     MACC(Ra, Rb, t0, t1, t2);
12321     //     *Pm = Rm = t0 * inv;
12322     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12323     //     MACC(Rm, Rn, t0, t1, t2);
12324 
12325     //     assert(t0 == 0, "broken Montgomery multiply");
12326 
12327     //     t0 = t1; t1 = t2; t2 = 0;
12328     //   }
12329 
12330     //   for (i = len; i < 2*len; i++) {
12331     //     int j;
12332 
12333     //     Pa = Pa_base + i-len;
12334     //     Pb = Pb_base + len;
12335     //     Pm = Pm_base + i-len;
12336     //     Pn = Pn_base + len;
12337 
12338     //     Ra = *++Pa;
12339     //     Rb = *--Pb;
12340     //     Rm = *++Pm;
12341     //     Rn = *--Pn;
12342 
12343     //     int iters = len*2-i-1;
12344     //     for (j = i-len+1; iters--; j++) {
12345     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12346     //       MACC(Ra, Rb, t0, t1, t2);
12347     //       Ra = *++Pa;
12348     //       Rb = *--Pb;
12349     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12350     //       MACC(Rm, Rn, t0, t1, t2);
12351     //       Rm = *++Pm;
12352     //       Rn = *--Pn;
12353     //     }
12354 
12355     //     Pm_base[i-len] = t0;
12356     //     t0 = t1; t1 = t2; t2 = 0;
12357     //   }
12358 
12359     //   while (t0)
12360     //     t0 = sub(Pm_base, Pn_base, t0, len);
12361     // }
12362 
12363     /**
12364      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
12365      * multiplies than Montgomery multiplication so it should be up to
12366      * 25% faster.  However, its loop control is more complex and it
12367      * may actually run slower on some machines.
12368      *
12369      * Arguments:
12370      *
12371      * Inputs:
12372      *   c_rarg0   - int array elements a
12373      *   c_rarg1   - int array elements n (the modulus)
12374      *   c_rarg2   - int length
12375      *   c_rarg3   - int inv
12376      *   c_rarg4   - int array elements m (the result)
12377      *
           * Returns the address of the first generated instruction.
           *
           * Differences from generate_multiply() visible in this function:
           * there is no early exit for a zero length, no non-PRODUCT check of
           * inv, and the loop control uses the 64-bit cmp/add/sub/lsl/lsr
           * forms rather than the 32-bit ones.
           * Note that generate_compiler_stubs() builds the squaring stub
           * with generate_multiply(), not with this function.
           *
12378      */
12379     address generate_square() {
12380       Label argh;
12381 
12382       align(CodeEntryAlignment);
12383       address entry = pc();
12384 
12385       enter();
12386 
12387       // Make room.
            // Lengths above 512 ints are rejected (unsigned HI compare).  The
            // scratch area is Rlen << exact_log2(4 * sizeof (jint)) bytes below
            // sp; Ra keeps its unrounded start and sp is rounded down to a
            // multiple of 2 * wordSize.
12388       cmpw(Rlen, 512);
12389       br(Assembler::HI, argh);
12390       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12391       andr(sp, Ra, -2 * wordSize);
12392 
12393       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
12394 
12395       {
12396         // Copy input args, reversing as we go.  We use Ra as a
12397         // temporary variable.
              // Each reverse() advances Ra past the words it wrote and leaves
              // Pa_base / Pn_base pointing at the start of its stack copy.
12398         reverse(Ra, Pa_base, Rlen, t0, t1);
12399         reverse(Ra, Pn_base, Rlen, t0, t1);
12400       }
12401 
12402       // Push all call-saved registers and also Pm_base which we'll need
12403       // at the end.
12404       save_regs();
12405 
            // Ra points just past the copied inputs; the result m is built
            // there.
12406       mov(Pm_base, Ra);
12407 
            // Clear the triple-precision accumulator t2:t1:t0.
12408       mov(t0, zr);
12409       mov(t1, zr);
12410       mov(t2, zr);
12411 
12412       block_comment("for (int i = 0; i < len; i++) {");
12413       mov(Ri, zr); {
12414         Label loop, end;
              // Here the loop label precedes the entry test, so the test is
              // part of the loop body (it is repeated at the bottom as well).
12415         bind(loop);
12416         cmp(Ri, Rlen);
12417         br(Assembler::GE, end);
12418 
12419         pre1(Ri);
12420 
12421         block_comment("for (j = (i+1)/2; j; j--) {"); {
                // Rj = (i + 1) / 2
12422           add(Rj, Ri, 1);
12423           lsr(Rj, Rj, 1);
12424           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12425         } block_comment("  } // j");
12426 
              // Accumulates Ra * Rb once when i is even.
12427         last_squaring(Ri);
12428 
12429         block_comment("  for (j = i/2; j; j--) {"); {
                // Rj = i / 2
12430           lsr(Rj, Ri, 1);
12431           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12432         } block_comment("  } // j");
12433 
12434         post1_squaring();
12435         add(Ri, Ri, 1);
12436         cmp(Ri, Rlen);
12437         br(Assembler::LT, loop);
12438 
12439         bind(end);
12440         block_comment("} // i");
12441       }
12442 
12443       block_comment("for (int i = len; i < 2*len; i++) {");
12444       mov(Ri, Rlen); {
12445         Label loop, end;
12446         bind(loop);
              // Compare Ri with Rlen << 1, i.e. 2*len.
12447         cmp(Ri, Rlen, Assembler::LSL, 1);
12448         br(Assembler::GE, end);
12449 
12450         pre2(Ri, Rlen);
12451 
12452         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
                // Rj = (2*len - i - 1) / 2
12453           lsl(Rj, Rlen, 1);
12454           sub(Rj, Rj, Ri);
12455           sub(Rj, Rj, 1);
12456           lsr(Rj, Rj, 1);
12457           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12458         } block_comment("  } // j");
12459 
12460         last_squaring(Ri);
12461 
12462         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
                // Rj = (2*len - i) / 2
12463           lsl(Rj, Rlen, 1);
12464           sub(Rj, Rj, Ri);
12465           lsr(Rj, Rj, 1);
12466           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12467         } block_comment("  } // j");
12468 
12469         post2(Ri, Rlen);
12470         add(Ri, Ri, 1);
12471         cmp(Ri, Rlen, Assembler::LSL, 1);
12472 
12473         br(Assembler::LT, loop);
12474         bind(end);
12475         block_comment("} // i");
12476       }
12477 
            // NOTE(review): normalize() is defined outside this view;
            // presumably it is the trailing `while (t0) t0 = sub(...)` step of
            // the C sketch below -- confirm.
12478       normalize(Rlen);
12479 
12480       mov(Ra, Pm_base);  // Save Pm_base in Ra
12481       restore_regs();  // Restore caller's Pm_base
12482 
12483       // Copy our result into caller's Pm_base
12484       reverse(Pm_base, Ra, Rlen, t0, t1);
12485 
12486       leave();
12487       ret(lr);
12488 
12489       // handler for error case
            // Placed after the ret, so it is reached only through the length
            // check branch above.
12490       bind(argh);
12491       stop("MontgomeryMultiply total_allocation must be <= 8192");
12492 
12493       return entry;
12494     }
12495     // In C, approximately:
12496 
12497     // void
12498     // montgomery_square(julong Pa_base[], julong Pn_base[],
12499     //                   julong Pm_base[], julong inv, int len) {
12500     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12501     //   julong *Pa, *Pb, *Pn, *Pm;
12502     //   julong Ra, Rb, Rn, Rm;
12503 
12504     //   int i;
12505 
12506     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12507 
12508     //   for (i = 0; i < len; i++) {
12509     //     int j;
12510 
12511     //     Pa = Pa_base;
12512     //     Pb = Pa_base + i;
12513     //     Pm = Pm_base;
12514     //     Pn = Pn_base + i;
12515 
12516     //     Ra = *Pa;
12517     //     Rb = *Pb;
12518     //     Rm = *Pm;
12519     //     Rn = *Pn;
12520 
12521     //     int iters = (i+1)/2;
12522     //     for (j = 0; iters--; j++) {
12523     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12524     //       MACC2(Ra, Rb, t0, t1, t2);
12525     //       Ra = *++Pa;
12526     //       Rb = *--Pb;
12527     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12528     //       MACC(Rm, Rn, t0, t1, t2);
12529     //       Rm = *++Pm;
12530     //       Rn = *--Pn;
12531     //     }
12532     //     if ((i & 1) == 0) {
12533     //       assert(Ra == Pa_base[j], "must be");
12534     //       MACC(Ra, Ra, t0, t1, t2);
12535     //     }
12536     //     iters = i/2;
12537     //     assert(iters == i-j, "must be");
12538     //     for (; iters--; j++) {
12539     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12540     //       MACC(Rm, Rn, t0, t1, t2);
12541     //       Rm = *++Pm;
12542     //       Rn = *--Pn;
12543     //     }
12544 
12545     //     *Pm = Rm = t0 * inv;
12546     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12547     //     MACC(Rm, Rn, t0, t1, t2);
12548 
12549     //     assert(t0 == 0, "broken Montgomery multiply");
12550 
12551     //     t0 = t1; t1 = t2; t2 = 0;
12552     //   }
12553 
12554     //   for (i = len; i < 2*len; i++) {
12555     //     int start = i-len+1;
12556     //     int end = start + (len - start)/2;
12557     //     int j;
12558 
12559     //     Pa = Pa_base + i-len;
12560     //     Pb = Pa_base + len;
12561     //     Pm = Pm_base + i-len;
12562     //     Pn = Pn_base + len;
12563 
12564     //     Ra = *++Pa;
12565     //     Rb = *--Pb;
12566     //     Rm = *++Pm;
12567     //     Rn = *--Pn;
12568 
12569     //     int iters = (2*len-i-1)/2;
12570     //     assert(iters == end-start, "must be");
12571     //     for (j = start; iters--; j++) {
12572     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12573     //       MACC2(Ra, Rb, t0, t1, t2);
12574     //       Ra = *++Pa;
12575     //       Rb = *--Pb;
12576     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12577     //       MACC(Rm, Rn, t0, t1, t2);
12578     //       Rm = *++Pm;
12579     //       Rn = *--Pn;
12580     //     }
12581     //     if ((i & 1) == 0) {
12582     //       assert(Ra == Pa_base[j], "must be");
12583     //       MACC(Ra, Ra, t0, t1, t2);
12584     //     }
12585     //     iters =  (2*len-i)/2;
12586     //     assert(iters == len-j, "must be");
12587     //     for (; iters--; j++) {
12588     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12589     //       MACC(Rm, Rn, t0, t1, t2);
12590     //       Rm = *++Pm;
12591     //       Rn = *--Pn;
12592     //     }
12593     //     Pm_base[i-len] = t0;
12594     //     t0 = t1; t1 = t2; t2 = 0;
12595     //   }
12596 
12597     //   while (t0)
12598     //     t0 = sub(Pm_base, Pn_base, t0, len);
12599     // }
12600   };
12601 
12602   // Call here from the interpreter or compiled code to either load
12603   // multiple returned values from the inline type instance being
12604   // returned to registers or to store returned values to a newly
12605   // allocated inline type instance.
        //
        //   destination - address of the runtime function the stub calls
        //   name        - name given to the CodeBuffer and the RuntimeStub
        //   has_res     - when true, after the call the stub fetches the VM
        //                 result oop into r0 and, if VM result metadata is
        //                 non-null, calls the pack handler found through it
        //
        // Returns the entry point of the newly created RuntimeStub.
12606   address generate_return_value_stub(address destination, const char* name, bool has_res) {
12607     // We need to save all registers the calling convention may use so
12608     // the runtime calls read or update those registers. This needs to
12609     // be in sync with SharedRuntime::java_return_convention().
12610     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
          //
          // Slot indices in BytesPerInt units, two per saved register, counted
          // upwards from sp once every register has been pushed.  The order
          // mirrors the stp/stpd sequence below read backwards.
12611     enum layout {
12612       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
12613       j_rarg6_off, j_rarg6_2,
12614       j_rarg5_off, j_rarg5_2,
12615       j_rarg4_off, j_rarg4_2,
12616       j_rarg3_off, j_rarg3_2,
12617       j_rarg2_off, j_rarg2_2,
12618       j_rarg1_off, j_rarg1_2,
12619       j_rarg0_off, j_rarg0_2,
12620 
12621       j_farg7_off, j_farg7_2,
12622       j_farg6_off, j_farg6_2,
12623       j_farg5_off, j_farg5_2,
12624       j_farg4_off, j_farg4_2,
12625       j_farg3_off, j_farg3_2,
12626       j_farg2_off, j_farg2_2,
12627       j_farg1_off, j_farg1_2,
12628       j_farg0_off, j_farg0_2,
12629 
12630       rfp_off, rfp_off2,
12631       return_off, return_off2,
12632 
12633       framesize // inclusive of return address
12634     };
12635 
12636     CodeBuffer code(name, 512, 64);
12637     MacroAssembler* masm = new MacroAssembler(&code);
12638 
          // The layout is expected to be a multiple of 16 bytes already, so
          // align_up() must not change the size (checked by the assert).
12639     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
12640     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
12641     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
12642     int frame_size_in_words = frame_size_in_bytes / wordSize;
12643 
12644     OopMapSet* oop_maps = new OopMapSet();
12645     OopMap* map = new OopMap(frame_size_in_slots, 0);
12646 
          // Record the save slot of every Java integer argument register ...
12647     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
12648     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
12649     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
12650     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
12651     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
12652     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
12653     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
12654     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
12655 
          // ... and of every Java floating-point argument register.
12656     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
12657     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
12658     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
12659     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
12660     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
12661     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
12662     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
12663     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
12664 
12665     address start = __ pc();
12666 
12667     __ enter(); // Save FP and LR before call
12668 
          // Push the floating-point argument registers first, then the integer
          // ones, one pair per pre-decrement of sp by 2 * wordSize; the last
          // pair pushed (j_rarg7, j_rarg6) therefore sits at the lowest
          // address, matching slot 0 of the layout enum.
12669     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
12670     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
12671     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
12672     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
12673 
12674     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
12675     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
12676     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
12677     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
12678 
12679     int frame_complete = __ offset();
12680 
12681     // Set up last_Java_sp and last_Java_fp
          // the_pc is also the pc under which the oop map is registered below.
12682     address the_pc = __ pc();
12683     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
12684 
12685     // Call runtime
          // Arguments: c_rarg0 = rthread, c_rarg1 = the value that was in r0.
12686     __ mov(c_rarg1, r0);
12687     __ mov(c_rarg0, rthread);
12688 
12689     __ mov(rscratch1, destination);
12690     __ blr(rscratch1);
12691 
12692     oop_maps->add_gc_map(the_pc - start, map);
12693 
12694     __ reset_last_Java_frame(false);
12695 
          // Reload every argument register from its save slot, in the exact
          // reverse of the push order above.
12696     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
12697     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
12698     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
12699     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
12700 
12701     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
12702     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
12703     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
12704     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
12705 
12706     // check for pending exceptions
12707     Label pending;
12708     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
12709     __ cbnz(rscratch1, pending);
12710 
12711     if (has_res) {
12712       // We just called SharedRuntime::store_inline_type_fields_to_buf. Check if we still
12713       // need to initialize the buffer and if so, call the inline class specific pack handler.
            // r0 receives the VM result oop; a null VM result metadata value
            // means there is nothing left to pack.
12714       Label skip_pack;
12715       __ get_vm_result_oop(r0, rthread);
12716       __ get_vm_result_metadata(rscratch1, rthread);
12717       __ cbz(rscratch1, skip_pack);
12718       __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
12719       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_offset()));
12720       __ blr(rscratch1);
            // NOTE(review): presumably orders the pack handler's stores to the
            // buffer before the buffer is handed back to the caller -- confirm.
12721       __ membar(Assembler::StoreStore);
12722       __ bind(skip_pack);
12723     }
12724 
12725     __ leave();
12726     __ ret(lr);
12727 
          // Exception path: tear the frame down and continue at the shared
          // forward-exception entry.
12728     __ bind(pending);
12729     __ leave();
12730     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
12731 
12732     // -------------
12733     // make sure all code is generated
12734     masm->flush();
12735 
12736     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
12737     return stub->entry_point();
12738   }
12739 
12740   // Initialization
12741   void generate_preuniverse_stubs() {
12742     // Intentionally empty: preuniverse stubs are not needed for aarch64.
12743   }
12744 
12745   void generate_initial_stubs() {
12746     // Generate the initial stubs and initialize their entry points.
12747 
12748     // Entry points that exist on all platforms.  Note: this is code
12749     // that could be shared among different platforms - however the
12750     // benefit seems to be smaller than the disadvantage of having a
12751     // much more complicated generator structure. See also comment in
12752     // stubRoutines.hpp.
12753 
12754     StubRoutines::_forward_exception_entry = generate_forward_exception();
12755 
12756     StubRoutines::_call_stub_entry =
12757       generate_call_stub(StubRoutines::_call_stub_return_address);
12758 
12759     // is referenced by megamorphic call
12760     StubRoutines::_catch_exception_entry = generate_catch_exception();
12761 
12762     // Initialize table for copy memory (arraycopy) check.
          // The table is created only if it does not exist yet.
12763     if (UnsafeMemoryAccess::_table == nullptr) {
12764       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
12765     }
12766 
          // The remaining stubs are optional; each is generated only when its
          // flag is set or its intrinsic is reported as available.
12767     if (UseCRC32Intrinsics) {
12768       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
12769     }
12770 
12771     if (UseCRC32CIntrinsics) {
12772       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
12773     }
12774 
12775     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
12776       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
12777     }
12778 
12779     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
12780       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
12781     }
12782 
          // The two float16 conversion stubs are generated together, and only
          // when both intrinsics are available.
12783     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
12784         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
12785       StubRoutines::_hf2f = generate_float16ToFloat();
12786       StubRoutines::_f2hf = generate_floatToFloat16();
12787     }
12788 
          // Both stubs come from generate_return_value_stub(); only the
          // "store ... to buf" variant passes has_res == true.
12789     if (InlineTypeReturnedAsFields) {
12790       StubRoutines::_load_inline_type_fields_in_regs =
12791          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
12792       StubRoutines::_store_inline_type_fields_to_buf =
12793          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
12794     }
12795 
12796   }
12797 
12798   void generate_continuation_stubs() {
12799     // Continuation stubs:
          // all four are generated unconditionally and their entry points
          // stored in StubRoutines.
12800     StubRoutines::_cont_thaw          = generate_cont_thaw();
12801     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
12802     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
12803     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
12804   }
12805 
12806   void generate_final_stubs() {
12807     // support for verify_oop (must happen after universe_init)
12808     if (VerifyOops) {
12809       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
12810     }
12811 
12812     // arraycopy stubs used by compilers
12813     generate_arraycopy_stubs();
12814 
12815     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
12816 
12817     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
12818 
12819     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
12820     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
12821 
          // The atomic entry points are generated only on Linux builds that
          // are compiled without __ARM_FEATURE_ATOMICS.
12822 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12823 
12824     generate_atomic_entry_points();
12825 
12826 #endif // LINUX && !__ARM_FEATURE_ATOMICS
12827 
12828 #ifdef COMPILER2
          // With InlineSecondarySupersTest set, only the slow-path stub is
          // generated here.
12829     if (UseSecondarySupersTable) {
12830       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
12831       if (! InlineSecondarySupersTest) {
12832         generate_lookup_secondary_supers_table_stub();
12833       }
12834     }
12835 #endif
12836 
12837     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
12838       StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
12839     }
12840 
12841     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
12842   }
12843 
12844   void generate_compiler_stubs() {
12845 #ifdef COMPILER2
12846 
12847     if (UseSVE == 0) {
12848       generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
12849     }
12850 
12851     // array equals stub for large arrays.
12852     if (!UseSimpleArrayEquals) {
12853       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
12854     }
12855 
12856     // arrays_hascode stub for large arrays.
12857     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
12858     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
12859     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
12860     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
12861     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
12862 
12863     // byte_array_inflate stub for large arrays.
12864     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
12865 
12866     // countPositives stub for large arrays.
12867     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
12868 
12869     generate_compare_long_strings();
12870 
12871     generate_string_indexof_stubs();
12872 
12873     if (UseMultiplyToLenIntrinsic) {
12874       StubRoutines::_multiplyToLen = generate_multiplyToLen();
12875     }
12876 
12877     if (UseSquareToLenIntrinsic) {
12878       StubRoutines::_squareToLen = generate_squareToLen();
12879     }
12880 
12881     if (UseMulAddIntrinsic) {
12882       StubRoutines::_mulAdd = generate_mulAdd();
12883     }
12884 
12885     if (UseSIMDForBigIntegerShiftIntrinsics) {
12886       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
12887       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
12888     }
12889 
12890     if (UseMontgomeryMultiplyIntrinsic) {
12891       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
12892       address start = load_archive_data(stub_id);
12893       if (start == nullptr) {
12894         // we have to generate it
12895         StubCodeMark mark(this, stub_id);
12896         MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
12897         start = g.generate_multiply();
12898         // record the stub start and end
12899         store_archive_data(stub_id, start, _masm->pc());
12900       }
12901       StubRoutines::_montgomeryMultiply = start;
12902     }
12903 
12904     if (UseMontgomerySquareIntrinsic) {
12905       StubId stub_id = StubId::stubgen_montgomerySquare_id;
12906       address start = load_archive_data(stub_id);
12907       if (start == nullptr) {
12908         // we have to generate it
12909         StubCodeMark mark(this, stub_id);
12910         MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
12911         // We use generate_multiply() rather than generate_square()
12912         // because it's faster for the sizes of modulus we care about.
12913         start = g.generate_multiply();
12914         // record the stub start and end
12915         store_archive_data(stub_id, start, _masm->pc());
12916       }
12917       StubRoutines::_montgomerySquare = start;
12918     }
12919 
12920     if (UseChaCha20Intrinsics) {
12921       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
12922     }
12923 
12924     if (UseKyberIntrinsics) {
12925       StubRoutines::_kyberNtt = generate_kyberNtt();
12926       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12927       StubRoutines::_kyberNttMult = generate_kyberNttMult();
12928       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12929       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12930       StubRoutines::_kyber12To16 = generate_kyber12To16();
12931       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12932     }
12933 
12934     if (UseDilithiumIntrinsics) {
12935       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12936       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12937       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12938       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12939       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12940     }
12941 
12942     if (UseBASE64Intrinsics) {
12943         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12944         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12945     }
12946 
12947     // data cache line writeback
12948     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12949     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12950 
12951     if (UseAESIntrinsics) {
12952       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12953       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12954       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12955       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12956       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12957     }
12958     if (UseGHASHIntrinsics) {
12959       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
12960       StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
12961       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
12962     }
12963     if (UseAESIntrinsics && UseGHASHIntrinsics) {
12964       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12965     }
12966 
12967     if (UseMD5Intrinsics) {
12968       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12969       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12970     }
12971     if (UseSHA1Intrinsics) {
12972       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12973       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12974     }
12975     if (UseSHA256Intrinsics) {
12976       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12977       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12978     }
12979     if (UseSHA512Intrinsics) {
12980       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12981       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12982     }
12983     if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {
12984       StubRoutines::_double_keccak         = generate_double_keccak();
12985       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12986       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12987     } else if (UseSHA3Intrinsics) {
12988       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12989       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12990     }
12991 
12992     if (UsePoly1305Intrinsics) {
12993       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12994     }
12995 
12996     // generate Adler32 intrinsics code
12997     if (UseAdler32Intrinsics) {
12998       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12999     }
13000 
13001 #endif // COMPILER2
13002   }
13003 
13004  public:
13005   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
13006     switch(blob_id) {
13007     case BlobId::stubgen_preuniverse_id:
13008       generate_preuniverse_stubs();
13009       break;
13010     case BlobId::stubgen_initial_id:
13011       generate_initial_stubs();
13012       break;
13013      case BlobId::stubgen_continuation_id:
13014       generate_continuation_stubs();
13015       break;
13016     case BlobId::stubgen_compiler_id:
13017       generate_compiler_stubs();
13018       break;
13019     case BlobId::stubgen_final_id:
13020       generate_final_stubs();
13021       break;
13022     default:
13023       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
13024       break;
13025     };
13026   }
13027 
13028 #if INCLUDE_CDS
13029   static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
13030     // external data defined in this file
13031 #define ADD(addr) external_addresses.append((address)(addr));
13032     ADD(_sha256_round_consts);
13033     ADD(_sha512_round_consts);
13034     ADD(_sha3_round_consts);
13035     ADD(_double_keccak_round_consts);
13036     ADD(_encodeBlock_toBase64);
13037     ADD(_encodeBlock_toBase64URL);
13038     ADD(_decodeBlock_fromBase64ForNoSIMD);
13039     ADD(_decodeBlock_fromBase64URLForNoSIMD);
13040     ADD(_decodeBlock_fromBase64ForSIMD);
13041     ADD(_decodeBlock_fromBase64URLForSIMD);
13042 #undef ADD
13043   }
13044 #endif // INCLUDE_CDS
13045 }; // end class declaration
13046 
13047 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
13048   StubGenerator g(code, blob_id, stub_data);
13049 }
13050 
13051 #if INCLUDE_CDS
13052 void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
13053   StubGenerator::init_AOTAddressTable(addresses);
13054 }
13055 #endif // INCLUDE_CDS
13056 
13057 #if defined (LINUX)
13058 
13059 // Define pointers to atomic stubs and initialize them to point to the
13060 // code in atomic_aarch64.S.
13061 
13062 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
13063   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
13064     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
13065   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
13066     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
13067 
13068 DEFAULT_ATOMIC_OP(fetch_add, 4, )
13069 DEFAULT_ATOMIC_OP(fetch_add, 8, )
13070 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
13071 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
13072 DEFAULT_ATOMIC_OP(xchg, 4, )
13073 DEFAULT_ATOMIC_OP(xchg, 8, )
13074 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
13075 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
13076 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
13077 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
13078 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
13079 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
13080 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
13081 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
13082 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
13083 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
13084 
13085 #undef DEFAULT_ATOMIC_OP
13086 
13087 #endif // LINUX