/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/intpow.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Constant data definitions

static const uint32_t _sha256_round_consts[64] = {
  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};

static const uint64_t _sha512_round_consts[80] = {
  0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
  0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
  0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
  0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
  0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
  0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
  0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
  0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
  0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
  0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
  0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
  0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
  0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
  0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
  0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
  0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
  0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
  0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
  0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
  0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
  0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
  0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
  0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
  0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
  0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
  0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
  0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
};

static const uint64_t _sha3_round_consts[24] = {
  0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
};

static const uint64_t _double_keccak_round_consts[24] = {
  0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
};

static const char _encodeBlock_toBase64[64] = {
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};

static const char _encodeBlock_toBase64URL[64] = {
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
};

// The non-SIMD lookup tables are mostly dumped from the fromBase64 array
// used in java.util.Base64, except that the trailing character '=' is also
// treated as an illegal value in this intrinsic. That is,
// java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
  255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
};

static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
  255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
};
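
// As an illustrative scalar use of the tables above (a sketch only; the
// actual intrinsic operates on whole blocks), decoding one character ch
// would go roughly like this:
//
//   uint8_t v = _decodeBlock_fromBase64ForNoSIMD[(uint8_t)ch];
//   if (v == 255u) { /* illegal input; note that this includes '=' */ }
//   else           { /* v is the 6-bit value encoded by ch */ }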

// A legal base64 code value is in the range [0, 127]. We need two table
// lookups with tbl/tbx and must combine their results to get the decoded
// data. The 1st table vector lookup uses tbl: out-of-range indices are set
// to 0 in the destination. The 2nd table vector lookup uses tbx: out-of-range
// indices leave the destination unchanged. Input [64, 126] is mapped to
// index [65, 127] in the second lookup. The value at index 64 is set to 0,
// so that we know we already got the decoded data from the 1st lookup.
static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
};

static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
};
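
// A scalar sketch of how the two lookups combine for one input byte c
// (illustration only; the stub does this sixteen lanes at a time):
//
//   uint8_t r = tbl(table_bytes_0_to_63, c);  // out-of-range lanes become 0
//   r = tbx(table_bytes_64_to_127, c', r);    // c' is c remapped as above;
//                                             // out-of-range lanes keep r
//   bool illegal = (r == 255u);               // 255 flags a bad input byte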


// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread (r7)          ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]
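  //
  // From C++ the stub is reached through StubRoutines::call_stub(), whose
  // argument list mirrors the register assignments above. Roughly (an
  // illustrative sketch of the call site; see JavaCalls::call_helper for
  // the real caller):
  //
  //   StubRoutines::call_stub()(link_addr,             // c_rarg0
  //                             result_addr,           // c_rarg1
  //                             result_type,           // c_rarg2
  //                             method, entry_point,   // c_rarg3, c_rarg4
  //                             parameters, size,      // c_rarg5, c_rarg6
  //                             thread);               // c_rarg7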

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubId stub_id = StubId::stubgen_call_stub_id;
    GrowableArray<address> entries;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 2, "sanity check");
    address start = load_archive_data(stub_id, &entries);
    if (start != nullptr) {
      assert(entries.length() == 1, "expected 1 extra entry");
      return_address = entries.at(0);
      return start;
    }
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();
    entries.append(return_address);

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
    Register Rresult = r14, Rresult_type = r15;
    __ ldr(Rresult, result);
    Label is_long, is_float, is_double, check_prim, exit;
    __ ldr(Rresult_type, result_type);
    __ cmp(Rresult_type, (u1)T_OBJECT);
    __ br(Assembler::EQ, check_prim);
    __ cmp(Rresult_type, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(Rresult_type, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(Rresult_type, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(Rresult));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT
    __ BIND(check_prim);
    if (InlineTypeReturnedAsFields) {
      // Check for scalarized return value
      __ tbz(r0, 0, is_long);
      // Load pack handler address
      __ andr(rscratch1, r0, -2);
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
      __ blr(rscratch1);
      __ b(exit);
    }

    __ BIND(is_long);
    __ str(r0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    // record the stub entry and end plus the auxiliary entry
    store_archive_data(stub_id, start, __ pc(), &entries);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubId stub_id = StubId::stubgen_catch_exception_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    // special case -- add file name string to AOT address table
    address file = (address)AOTCodeCache::add_C_string(__FILE__);
    __ lea(rscratch1, ExternalAddress(file));
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubId stub_id = StubId::stubgen_forward_exception_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {
    StubId stub_id = StubId::stubgen_verify_oop_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }

  // Generate indices for iota vector.
  void generate_iota_indices(StubId stub_id) {
    GrowableArray<address> entries;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
    address start = load_archive_data(stub_id, &entries);
    if (start != nullptr) {
      assert(entries.length() == entry_count - 1,
             "unexpected entries count %d", entries.length());
      StubRoutines::aarch64::_vector_iota_indices[0] = start;
      for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
        StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
      }
      return;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    entries.append(__ pc());
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    entries.append(__ pc());
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    entries.append(__ pc());
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    entries.append(__ pc());
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    entries.append(__ pc());
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc(), &entries);

    // install the entry addresses in the entry array
    assert(entries.length() == entry_count - 1,
           "unexpected entries count %d", entries.length());
    StubRoutines::aarch64::_vector_iota_indices[0] = start;
    for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
      StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
    }
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.
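  //
  // The caller, MacroAssembler::zero_words, uses the stub roughly as
  // follows (an illustrative sketch, not the exact code):
  //
  //   if (cnt >= zero_words_block_size) {
  //     bl zero_blocks            // clears the bulk; tail left in r10/r11
  //   }
  //   // then clears the remaining r11 < zero_words_block_size words
  //   // inline, e.g. with a short run of stp/str of zr through r10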

  address generate_zero_blocks() {
    StubId stub_id = StubId::stubgen_zero_blocks_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
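  // In C-like terms the forwards flavour behaves roughly as below (an
  // illustrative sketch only; the generated code is software-pipelined,
  // may use SIMD pairs, and funnels loads/stores through the GC barrier
  // helpers):
  //
  //   while (count >= 16) { copy 8 words; s += 8; d += 8; count -= 8; }
  //   copy 8 words; s += 8; d += 8;             // drain the pipeline
  //   if (count & 4) { copy 4 words; s += 4; d += 4; }
  //   if (count & 2) { copy 2 words; s += 2; d += 2; }
  //   // the (count & 1) tail word, if any, is left to the caller
  //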
  address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    BasicType type;
    copy_direction direction;

    switch (stub_id) {
    case StubId::stubgen_copy_byte_f_id:
      direction = copy_forwards;
      type = T_BYTE;
      break;
    case StubId::stubgen_copy_byte_b_id:
      direction = copy_backwards;
      type = T_BYTE;
      break;
    case StubId::stubgen_copy_oop_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case StubId::stubgen_copy_oop_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    case StubId::stubgen_copy_oop_uninit_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case StubId::stubgen_copy_oop_uninit_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    default:
      ShouldNotReachHere();
    }

    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    start = __ pc();

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8}, i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
 1249 
 1250       if (direction == copy_forwards) {
 1251         __ sub(s, s, 16);
 1252         __ sub(d, d, 8);
 1253       }
 1254 
 1255       // Fill 8 registers
 1256       //
 1257       // for forwards copy s was offset by -16 from the original input
 1258       // value of s so the register contents are at these offsets
 1259       // relative to the 64 bit block addressed by that original input
 1260       // and so on for each successive 64 byte block when s is updated
 1261       //
 1262       // t0 at offset 0,  t1 at offset 8
 1263       // t2 at offset 16, t3 at offset 24
 1264       // t4 at offset 32, t5 at offset 40
 1265       // t6 at offset 48, t7 at offset 56
 1266 
 1267       // for backwards copy s was not offset so the register contents
 1268       // are at these offsets into the preceding 64 byte block
 1269       // relative to that original input and so on for each successive
 1270       // preceding 64 byte block when s is updated. this explains the
 1271       // slightly counter-intuitive looking pattern of register usage
 1272       // in the stp instructions for backwards copy.
 1273       //
 1274       // t0 at offset -16, t1 at offset -8
 1275       // t2 at offset -32, t3 at offset -24
 1276       // t4 at offset -48, t5 at offset -40
 1277       // t6 at offset -64, t7 at offset -56
 1278 
 1279       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1280       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1281       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1282       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1283 
 1284       __ subs(count, count, 16);
 1285       __ br(Assembler::LO, drain);
 1286 
 1287       int prefetch = PrefetchCopyIntervalInBytes;
 1288       bool use_stride = false;
 1289       if (direction == copy_backwards) {
 1290         use_stride = prefetch > 256;
 1291         prefetch = -prefetch;
 1292         if (use_stride) __ mov(stride, prefetch);
 1293       }
 1294 
 1295       __ bind(again);
 1296 
 1297       if (PrefetchCopyIntervalInBytes > 0)
 1298         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1299 
 1300       if (direction == copy_forwards) {
        // allowing for the offset of -8, the store instructions place
        // registers into the target 64 byte block at the following
        // offsets
 1304         //
 1305         // t0 at offset 0
 1306         // t1 at offset 8,  t2 at offset 16
 1307         // t3 at offset 24, t4 at offset 32
 1308         // t5 at offset 40, t6 at offset 48
 1309         // t7 at offset 56
 1310 
 1311         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1312         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1313         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1314         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1315         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1316         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1317         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1318         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1319         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1320       } else {
        // d was not offset when we started, so the registers are
        // written into the 64 byte block preceding d with the
        // following offsets
 1324         //
 1325         // t1 at offset -8
 1326         // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
 1328         // t7 at offset -56, t4 at offset -48
 1329         //                   t6 at offset -64
 1330         //
 1331         // note that this matches the offsets previously noted for the
 1332         // loads
 1333 
 1334         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1335         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1336         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1337         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1338         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1339         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1340         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1341         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1342         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1343       }
 1344 
 1345       __ subs(count, count, 8);
 1346       __ br(Assembler::HS, again);
 1347 
 1348       // Drain
 1349       //
 1350       // this uses the same pattern of offsets and register arguments
 1351       // as above
 1352       __ bind(drain);
 1353       if (direction == copy_forwards) {
 1354         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1355         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1356         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1357         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1358         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1359       } else {
 1360         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1361         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1362         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1363         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1364         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1365       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
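      //
      // e.g. (illustrative) with count == 22 longs, 16 are copied by
      // the pipelined loop plus drain above, leaving low count bits
      // 0b110, so the tail below copies a 4 long subblock and then a
      // 2 long subblock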
 1370       {
 1371         Label L1, L2;
 1372         __ tbz(count, exact_log2(4), L1);
 1373         // this is the same as above but copying only 4 longs hence
 1374         // with only one intervening stp between the str instructions
 1375         // but note that the offsets and registers still follow the
 1376         // same pattern
 1377         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1378         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1379         if (direction == copy_forwards) {
 1380           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1381           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1382           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1383         } else {
 1384           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1385           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1386           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1387         }
 1388         __ bind(L1);
 1389 
 1390         __ tbz(count, 1, L2);
 1391         // this is the same as above but copying only 2 longs hence
 1392         // there is no intervening stp between the str instructions
 1393         // but note that the offset and register patterns are still
 1394         // the same
 1395         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1396         if (direction == copy_forwards) {
 1397           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1398           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1399         } else {
 1400           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1401           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1402         }
 1403         __ bind(L2);
 1404 
        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d point just past the last words written
 1407 
 1408         if (direction == copy_forwards) {
 1409           __ add(s, s, 16);
 1410           __ add(d, d, 8);
 1411         }
 1412 
 1413       }
 1414 
 1415       __ ret(lr);
 1416     }
 1417 
 1418     // record the stub entry and end
 1419     store_archive_data(stub_id, start, __ pc());
 1420 
 1421     return start;
 1422   }
 1423 
 1424   // Small copy: less than 16 bytes.
 1425   //
 1426   // NB: Ignores all of the bits of count which represent more than 15
 1427   // bytes, so a caller doesn't have to mask them.
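  //
  // For example (illustrative): a byte copy (granularity 1) with
  // count == 11 moves 8 + 2 + 1 bytes via the word, short and byte
  // cases below.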
 1428 
 1429   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1430     bool is_backwards = step < 0;
 1431     size_t granularity = g_uabs(step);
 1432     int direction = is_backwards ? -1 : 1;
 1433 
 1434     Label Lword, Lint, Lshort, Lbyte;
 1435 
 1436     assert(granularity
 1437            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1438 
 1439     const Register t0 = r3;
 1440     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1441     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1442 
 1443     // ??? I don't know if this bit-test-and-branch is the right thing
 1444     // to do.  It does a lot of jumping, resulting in several
 1445     // mispredicted branches.  It might make more sense to do this
 1446     // with something like Duff's device with a single computed branch.
 1447 
 1448     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1449     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1450     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1451     __ bind(Lword);
 1452 
 1453     if (granularity <= sizeof (jint)) {
 1454       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1455       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1456       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1457       __ bind(Lint);
 1458     }
 1459 
 1460     if (granularity <= sizeof (jshort)) {
 1461       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1462       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1463       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1464       __ bind(Lshort);
 1465     }
 1466 
 1467     if (granularity <= sizeof (jbyte)) {
 1468       __ tbz(count, 0, Lbyte);
 1469       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1470       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1471       __ bind(Lbyte);
 1472     }
 1473   }
 1474 
 1475   // All-singing all-dancing memory copy.
 1476   //
 1477   // Copy count units of memory from s to d.  The size of a unit is
 1478   // step, which can be positive or negative depending on the direction
 1479   // of copy.  If is_aligned is false, we align the source address.
 1480   //
 1481 
 1482   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1483                    Register s, Register d, Register count, int step) {
 1484     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1485     bool is_backwards = step < 0;
 1486     unsigned int granularity = g_uabs(step);
 1487     const Register t0 = r3, t1 = r4;
 1488 
 1489     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
 1490     // load all the data before writing anything
 1491     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1492     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1493     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1494     const Register send = r17, dend = r16;
 1495     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1496     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1497     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1498 
 1499     if (PrefetchCopyIntervalInBytes > 0)
 1500       __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80)/granularity));
 1502     __ br(Assembler::HI, copy_big);
 1503 
 1504     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1505     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1506 
 1507     __ cmp(count, u1(16/granularity));
 1508     __ br(Assembler::LS, copy16);
 1509 
 1510     __ cmp(count, u1(64/granularity));
 1511     __ br(Assembler::HI, copy80);
 1512 
 1513     __ cmp(count, u1(32/granularity));
 1514     __ br(Assembler::LS, copy32);
 1515 
 1516     // 33..64 bytes
 1517     if (UseSIMDForMemoryOps) {
 1518       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1519       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1520       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1521       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1522     } else {
 1523       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1524       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1525       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1526       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1527 
 1528       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1529       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1530       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1531       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1532     }
 1533     __ b(finish);
 1534 
 1535     // 17..32 bytes
 1536     __ bind(copy32);
 1537     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1538     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1539 
 1540     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1541     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1542     __ b(finish);
 1543 
 1544     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
 1546     __ bind(copy80);
 1547     if (UseSIMDForMemoryOps) {
 1548       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1549       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1550       // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at
      // least 4 byte aligned; pointers for arrays of jlong are 8 byte
      // aligned. The largest performance drop has been observed for the
      // 65-80 byte range. For such cases, using a pair of ldp/stp
      // instead of the third pair of ldpq/stpq fixes the performance
      // issue.
 1557       if (granularity < sizeof (jint)) {
 1558         Label copy96;
 1559         __ cmp(count, u1(80/granularity));
 1560         __ br(Assembler::HI, copy96);
 1561         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1562 
 1563         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1564         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1565 
 1566         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1567         __ b(finish);
 1568 
 1569         __ bind(copy96);
 1570       }
 1571       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1572 
 1573       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1574       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1575 
 1576       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1577     } else {
 1578       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1579       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1580       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1581       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1582       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1583 
 1584       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1585       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1586       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1587       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1588       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1589     }
 1590     __ b(finish);
 1591 
 1592     // 0..16 bytes
 1593     __ bind(copy16);
 1594     __ cmp(count, u1(8/granularity));
 1595     __ br(Assembler::LO, copy8);
 1596 
 1597     // 8..16 bytes
 1598     bs.copy_load_at_8(t0, Address(s, 0));
 1599     bs.copy_load_at_8(t1, Address(send, -8));
 1600     bs.copy_store_at_8(Address(d, 0), t0);
 1601     bs.copy_store_at_8(Address(dend, -8), t1);
 1602     __ b(finish);
 1603 
 1604     if (granularity < 8) {
 1605       // 4..7 bytes
 1606       __ bind(copy8);
 1607       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1608       __ ldrw(t0, Address(s, 0));
 1609       __ ldrw(t1, Address(send, -4));
 1610       __ strw(t0, Address(d, 0));
 1611       __ strw(t1, Address(dend, -4));
 1612       __ b(finish);
 1613       if (granularity < 4) {
 1614         // 0..3 bytes
 1615         __ bind(copy4);
 1616         __ cbz(count, finish); // get rid of 0 case
 1617         if (granularity == 2) {
 1618           __ ldrh(t0, Address(s, 0));
 1619           __ strh(t0, Address(d, 0));
 1620         } else { // granularity == 1
 1621           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1622           // the first and last byte.
 1623           // Handle the 3 byte case by loading and storing base + count/2
 1624           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the
          // same byte 3 times.
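          // e.g. (illustrative) count == 3: s[0] -> d[0] (first),
          // s[2] -> d[2] (last) and s[1] -> d[1] (base + count/2),
          // covering each byte exactly once.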
 1627           __ lsr(count, count, 1);
 1628           __ ldrb(t0, Address(s, 0));
 1629           __ ldrb(t1, Address(send, -1));
 1630           __ ldrb(t2, Address(s, count));
 1631           __ strb(t0, Address(d, 0));
 1632           __ strb(t1, Address(dend, -1));
 1633           __ strb(t2, Address(d, count));
 1634         }
 1635         __ b(finish);
 1636       }
 1637     }
 1638 
 1639     __ bind(copy_big);
 1640     if (is_backwards) {
 1641       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1642       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1643     }
 1644 
    // Now we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.
 1647 
    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that
    // register cannot be used as a temp register, as it contains the count.
 1652 
 1653     Label aligned;
 1654 
 1655     if (is_aligned) {
 1656       // We may have to adjust by 1 word to get s 2-word-aligned.
 1657       __ tbz(s, exact_log2(wordSize), aligned);
 1658       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1659       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1660       __ sub(count, count, wordSize/granularity);
 1661     } else {
 1662       if (is_backwards) {
 1663         __ andr(r15, s, 2 * wordSize - 1);
 1664       } else {
 1665         __ neg(r15, s);
 1666         __ andr(r15, r15, 2 * wordSize - 1);
 1667       }
 1668       // r15 is the byte adjustment needed to align s.
 1669       __ cbz(r15, aligned);
 1670       int shift = exact_log2(granularity);
 1671       if (shift > 0) {
 1672         __ lsr(r15, r15, shift);
 1673       }
 1674       __ sub(count, count, r15);
 1675 
 1676 #if 0
 1677       // ?? This code is only correct for a disjoint copy.  It may or
 1678       // may not make sense to use it in that case.
 1679 
 1680       // Copy the first pair; s and d may not be aligned.
 1681       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1682       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1683 
 1684       // Align s and d, adjust count
 1685       if (is_backwards) {
 1686         __ sub(s, s, r15);
 1687         __ sub(d, d, r15);
 1688       } else {
 1689         __ add(s, s, r15);
 1690         __ add(d, d, r15);
 1691       }
 1692 #else
 1693       copy_memory_small(decorators, type, s, d, r15, step);
 1694 #endif
 1695     }
 1696 
 1697     __ bind(aligned);
 1698 
 1699     // s is now 2-word-aligned.
 1700 
    // We have a count of units and some trailing bytes. Adjust the
    // count and do a bulk copy of words. If the shift is zero,
    // perform a move instead to benefit from zero-latency moves.
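    // e.g. (illustrative) for a byte copy (granularity 1) shift == 3,
    // so r15 = count >> 3 counts 8-byte words; for a long copy
    // shift == 0 and the count is moved as-is.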
 1704     int shift = exact_log2(wordSize/granularity);
 1705     if (shift > 0) {
 1706       __ lsr(r15, count, shift);
 1707     } else {
 1708       __ mov(r15, count);
 1709     }
 1710     if (direction == copy_forwards) {
 1711       if (type != T_OBJECT) {
 1712         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
 1713         __ blr(rscratch1);
 1714       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1715         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
 1716         __ blr(rscratch1);
 1717       } else {
 1718         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
 1719         __ blr(rscratch1);
 1720       }
 1721     } else {
 1722       if (type != T_OBJECT) {
 1723         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
 1724         __ blr(rscratch1);
 1725       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1726         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
 1727         __ blr(rscratch1);
 1728       } else {
 1729         __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
 1730         __ blr(rscratch1);
 1731       }
 1732     }
 1733 
 1734     // And the tail.
 1735     copy_memory_small(decorators, type, s, d, count, step);
 1736 
 1737     if (granularity >= 8) __ bind(copy8);
 1738     if (granularity >= 4) __ bind(copy4);
 1739     __ bind(finish);
 1740   }
 1741 
 1742 
 1743   void clobber_registers() {
 1744 #ifdef ASSERT
 1745     RegSet clobbered
 1746       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1747     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1748     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1749     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1750       __ mov(*it, rscratch1);
 1751     }
 1752 #endif
 1753 
 1754   }
 1755 
 1756   // Scan over array at a for count oops, verifying each one.
 1757   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1758   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1759     Label loop, end;
 1760     __ mov(rscratch1, a);
 1761     __ mov(rscratch2, zr);
 1762     __ bind(loop);
 1763     __ cmp(rscratch2, count);
 1764     __ br(Assembler::HS, end);
 1765     if (size == wordSize) {
 1766       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1767       __ verify_oop(temp);
 1768     } else {
 1769       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1770       __ decode_heap_oop(temp); // calls verify_oop
 1771     }
 1772     __ add(rscratch2, rscratch2, 1);
 1773     __ b(loop);
 1774     __ bind(end);
 1775   }
 1776 
 1777   // Arguments:
 1778   //   stub_id - is used to name the stub and identify all details of
 1779   //             how to perform the copy.
 1780   //
 1781   //   nopush_entry - is assigned to the stub's post push entry point
 1782   //                  unless it is null
 1783   //
 1784   // Inputs:
 1785   //   c_rarg0   - source array address
 1786   //   c_rarg1   - destination array address
 1787   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1788   //
 1789   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1790   // the hardware handle it.  The two dwords within qwords that span
 1791   // cache line boundaries will still be loaded and stored atomically.
 1792   //
 1793   // Side Effects: nopush_entry is set to the (post push) entry point
 1794   //               so it can be used by the corresponding conjoint
 1795   //               copy method
 1796   //
 1797   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1798     int size;
 1799     bool aligned;
 1800     bool is_oop;
 1801     bool dest_uninitialized;
 1802     switch (stub_id) {
 1803     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1804       size = sizeof(jbyte);
 1805       aligned = false;
 1806       is_oop = false;
 1807       dest_uninitialized = false;
 1808       break;
 1809     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1810       size = sizeof(jbyte);
 1811       aligned = true;
 1812       is_oop = false;
 1813       dest_uninitialized = false;
 1814       break;
 1815     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1816       size = sizeof(jshort);
 1817       aligned = false;
 1818       is_oop = false;
 1819       dest_uninitialized = false;
 1820       break;
 1821     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1822       size = sizeof(jshort);
 1823       aligned = true;
 1824       is_oop = false;
 1825       dest_uninitialized = false;
 1826       break;
 1827     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1828       size = sizeof(jint);
 1829       aligned = false;
 1830       is_oop = false;
 1831       dest_uninitialized = false;
 1832       break;
 1833     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1834       size = sizeof(jint);
 1835       aligned = true;
 1836       is_oop = false;
 1837       dest_uninitialized = false;
 1838       break;
 1839     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1840       // since this is always aligned we can (should!) use the same
 1841       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1842       ShouldNotReachHere();
 1843       break;
 1844     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1845       size = sizeof(jlong);
 1846       aligned = true;
 1847       is_oop = false;
 1848       dest_uninitialized = false;
 1849       break;
 1850     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1851       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1852       aligned = !UseCompressedOops;
 1853       is_oop = true;
 1854       dest_uninitialized = false;
 1855       break;
 1856     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1857       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1858       aligned = !UseCompressedOops;
 1859       is_oop = true;
 1860       dest_uninitialized = false;
 1861       break;
 1862     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1863       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1864       aligned = !UseCompressedOops;
 1865       is_oop = true;
 1866       dest_uninitialized = true;
 1867       break;
 1868     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1869       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1870       aligned = !UseCompressedOops;
 1871       is_oop = true;
 1872       dest_uninitialized = true;
 1873       break;
 1874     default:
 1875       ShouldNotReachHere();
 1876       break;
 1877     }
    // all stubs provide a 2nd entry which omits the frame push for
    // use when bailing out from a conjoint copy. However, we may also
    // need some extra addresses for memory access protection.
 1881     int entry_count = StubInfo::entry_count(stub_id);
 1882     assert(entry_count == 2, "sanity check");
 1883     assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");
 1884 
 1885     bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
 1886     int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
 1887     GrowableArray<address> entries;
 1888     GrowableArray<address> extras;
 1889     GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
 1890     address start = load_archive_data(stub_id, &entries, extras_ptr);
 1891     if (start != nullptr) {
 1892       assert(entries.length() == entry_count - 1,
 1893              "unexpected entries count %d", entries.length());
 1894       *nopush_entry = entries.at(0);
 1895       assert(extras.length() == extra_count,
 1896              "unexpected extra count %d", extras.length());
 1897       if (add_extras) {
 1898         // register one handler at offset 0
 1899         register_unsafe_access_handlers(extras, 0, 1);
 1900       }
 1901       return start;
 1902     }
 1903 
 1904     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1905     RegSet saved_reg = RegSet::of(s, d, count);
 1906 
 1907     __ align(CodeEntryAlignment);
 1908     StubCodeMark mark(this, stub_id);
 1909     start = __ pc();
 1910     __ enter();
 1911 
 1912     *nopush_entry = __ pc();
 1913     entries.append(*nopush_entry);
 1914 
 1915     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1916     BLOCK_COMMENT("Post-Push Entry:");
 1917 
 1918     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1919     if (dest_uninitialized) {
 1920       decorators |= IS_DEST_UNINITIALIZED;
 1921     }
 1922     if (aligned) {
 1923       decorators |= ARRAYCOPY_ALIGNED;
 1924     }
 1925 
 1926     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1927     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1928 
 1929     if (is_oop) {
 1930       // save regs before copy_memory
 1931       __ push(RegSet::of(d, count), sp);
 1932     }
 1933     {
 1934       // UnsafeMemoryAccess page error: continue after unsafe access
 1935       UnsafeMemoryAccessMark umam(this, add_extras, true);
 1936       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1937     }
 1938 
 1939     if (is_oop) {
 1940       __ pop(RegSet::of(d, count), sp);
 1941       if (VerifyOops)
 1942         verify_oop_array(size, d, count, r16);
 1943     }
 1944 
 1945     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1946 
 1947     __ leave();
 1948     __ mov(r0, zr); // return 0
 1949     __ ret(lr);
 1950 
 1951     address end = __ pc();
 1952 
 1953     if (add_extras) {
 1954       // retrieve the registered handler addresses
 1955       retrieve_unsafe_access_handlers(start, end, extras);
 1956       assert(extras.length() == extra_count
 1957              , "incorrect handlers count %d", extras.length());
 1958     }
 1959 
 1960     // record the stub entry and end plus the no_push entry and any
 1961     // extra handler addresses
 1962     store_archive_data(stub_id, start, end, &entries, extras_ptr);
 1963 
 1964     return start;
 1965   }
 1966 
 1967   // Arguments:
 1968   //   stub_id - is used to name the stub and identify all details of
 1969   //             how to perform the copy.
 1970   //
  //   nooverlap_target - identifies the (post push) entry for the
 1972   //             corresponding disjoint copy routine which can be
 1973   //             jumped to if the ranges do not actually overlap
 1974   //
 1975   //   nopush_entry - is assigned to the stub's post push entry point
 1976   //                  unless it is null
 1977   //
 1978   //
 1979   // Inputs:
 1980   //   c_rarg0   - source array address
 1981   //   c_rarg1   - destination array address
 1982   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1983   //
 1984   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1985   // the hardware handle it.  The two dwords within qwords that span
 1986   // cache line boundaries will still be loaded and stored atomically.
 1987   //
 1988   // Side Effects:
 1989   //   nopush_entry is set to the no-overlap entry point so it can be
 1990   //   used by some other conjoint copy method
 1991   //
 1992   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1993     int size;
 1994     bool aligned;
 1995     bool is_oop;
 1996     bool dest_uninitialized;
 1997     switch (stub_id) {
 1998     case StubId::stubgen_jbyte_arraycopy_id:
 1999       size = sizeof(jbyte);
 2000       aligned = false;
 2001       is_oop = false;
 2002       dest_uninitialized = false;
 2003       break;
 2004     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 2005       size = sizeof(jbyte);
 2006       aligned = true;
 2007       is_oop = false;
 2008       dest_uninitialized = false;
 2009       break;
 2010     case StubId::stubgen_jshort_arraycopy_id:
 2011       size = sizeof(jshort);
 2012       aligned = false;
 2013       is_oop = false;
 2014       dest_uninitialized = false;
 2015       break;
 2016     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 2017       size = sizeof(jshort);
 2018       aligned = true;
 2019       is_oop = false;
 2020       dest_uninitialized = false;
 2021       break;
 2022     case StubId::stubgen_jint_arraycopy_id:
 2023       size = sizeof(jint);
 2024       aligned = false;
 2025       is_oop = false;
 2026       dest_uninitialized = false;
 2027       break;
 2028     case StubId::stubgen_arrayof_jint_arraycopy_id:
 2029       size = sizeof(jint);
 2030       aligned = true;
 2031       is_oop = false;
 2032       dest_uninitialized = false;
 2033       break;
 2034     case StubId::stubgen_jlong_arraycopy_id:
 2035       // since this is always aligned we can (should!) use the same
      // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 2037       ShouldNotReachHere();
 2038       break;
 2039     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 2040       size = sizeof(jlong);
 2041       aligned = true;
 2042       is_oop = false;
 2043       dest_uninitialized = false;
 2044       break;
 2045     case StubId::stubgen_oop_arraycopy_id:
 2046       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2047       aligned = !UseCompressedOops;
 2048       is_oop = true;
 2049       dest_uninitialized = false;
 2050       break;
 2051     case StubId::stubgen_arrayof_oop_arraycopy_id:
 2052       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2053       aligned = !UseCompressedOops;
 2054       is_oop = true;
 2055       dest_uninitialized = false;
 2056       break;
 2057     case StubId::stubgen_oop_arraycopy_uninit_id:
 2058       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2059       aligned = !UseCompressedOops;
 2060       is_oop = true;
 2061       dest_uninitialized = true;
 2062       break;
 2063     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 2064       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 2065       aligned = !UseCompressedOops;
 2066       is_oop = true;
 2067       dest_uninitialized = true;
 2068       break;
 2069     default:
 2070       ShouldNotReachHere();
 2071     }
 2072     // only some conjoint stubs generate a 2nd entry
 2073     int entry_count = StubInfo::entry_count(stub_id);
 2074     int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
 2075     assert(entry_count == expected_entry_count,
 2076            "expected entry count %d does not match declared entry count %d for stub %s",
 2077            expected_entry_count, entry_count, StubInfo::name(stub_id));
 2078 
 2079     // We need to protect memory accesses in certain cases
 2080     bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
 2081     int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
 2082     GrowableArray<address> entries;
 2083     GrowableArray<address> extras;
 2084     GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
 2085     GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
 2086     address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
 2087     if (start != nullptr) {
 2088       assert(entries.length() == expected_entry_count - 1,
 2089              "unexpected entries count %d", entries.length());
 2090       assert(extras.length() == extra_count,
 2091              "unexpected extra count %d", extras.length());
 2092       if (nopush_entry != nullptr) {
 2093         *nopush_entry = entries.at(0);
 2094       }
 2095       if (add_extras) {
 2096         // register one handler at offset 0
 2097         register_unsafe_access_handlers(extras, 0, 1);
 2098       }
 2099       return start;
 2100     }
 2101 
 2102     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2103     RegSet saved_regs = RegSet::of(s, d, count);
 2104     StubCodeMark mark(this, stub_id);
 2105     start = __ pc();
 2106     __ enter();
 2107 
 2108     if (nopush_entry != nullptr) {
 2109       *nopush_entry = __ pc();
 2110       entries.append(*nopush_entry);
 2111       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 2112       BLOCK_COMMENT("Post-Push Entry:");
 2113     }
 2114 
 2115     // use fwd copy when (d-s) above_equal (count*size)
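    // (the compare is unsigned, so if d is below s the subtraction
    // wraps to a large value and the forward copy is also taken,
    // which is safe in that case)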
 2116     Label L_overlapping;
 2117     __ sub(rscratch1, d, s);
 2118     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 2119     __ br(Assembler::LO, L_overlapping);
 2120     __ b(RuntimeAddress(nooverlap_target));
 2121     __ bind(L_overlapping);
 2122 
 2123     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 2124     if (dest_uninitialized) {
 2125       decorators |= IS_DEST_UNINITIALIZED;
 2126     }
 2127     if (aligned) {
 2128       decorators |= ARRAYCOPY_ALIGNED;
 2129     }
 2130 
 2131     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 2132     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 2133 
 2134     if (is_oop) {
 2135       // save regs before copy_memory
 2136       __ push(RegSet::of(d, count), sp);
 2137     }
 2138     {
 2139       // UnsafeMemoryAccess page error: continue after unsafe access
 2140       UnsafeMemoryAccessMark umam(this, add_extras, true);
 2141       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 2142     }
 2143     if (is_oop) {
 2144       __ pop(RegSet::of(d, count), sp);
 2145       if (VerifyOops)
 2146         verify_oop_array(size, d, count, r16);
 2147     }
 2148     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 2149     __ leave();
 2150     __ mov(r0, zr); // return 0
 2151     __ ret(lr);
 2152 
 2153     assert(entries.length() == expected_entry_count - 1,
 2154            "unexpected entries count %d", entries.length());
 2155 
 2156     address end = __ pc();
 2157 
 2158     if (add_extras) {
 2159       // retrieve the registered handler addresses
 2160       retrieve_unsafe_access_handlers(start, end, extras);
 2161       assert(extras.length() == extra_count,
 2162              "incorrect handlers count %d", extras.length());
 2163     }
 2164 
 2165     // record the stub entry and end plus any no_push entry and/or
 2166     // extra handler addresses
 2167     store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
 2168 
 2169     return start;
 2170   }
 2171 
 2172   // Helper for generating a dynamic type check.
 2173   // Smashes rscratch1, rscratch2.
 2174   void generate_type_check(Register sub_klass,
 2175                            Register super_check_offset,
 2176                            Register super_klass,
 2177                            Register temp1,
 2178                            Register temp2,
 2179                            Register result,
 2180                            Label& L_success) {
 2181     assert_different_registers(sub_klass, super_check_offset, super_klass);
 2182 
 2183     BLOCK_COMMENT("type_check:");
 2184 
 2185     Label L_miss;
 2186 
 2187     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 2188                                      super_check_offset);
 2189     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 2190 
 2191     // Fall through on failure!
 2192     __ BIND(L_miss);
 2193   }
 2194 
 2195   //
 2196   //  Generate checkcasting array copy stub
 2197   //
 2198   //  Input:
 2199   //    c_rarg0   - source array address
 2200   //    c_rarg1   - destination array address
 2201   //    c_rarg2   - element count, treated as ssize_t, can be zero
 2202   //    c_rarg3   - size_t ckoff (super_check_offset)
 2203   //    c_rarg4   - oop ckval (super_klass)
 2204   //
 2205   //  Output:
 2206   //    r0 ==  0  -  success
 2207   //    r0 == -1^K - failure, where K is partial transfer count
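  //    (i.e. r0 == ~K == -K-1; e.g. a failure after copying 2
  //    elements returns -3)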
 2208   //
 2209   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 2210     bool dest_uninitialized;
 2211     switch (stub_id) {
 2212     case StubId::stubgen_checkcast_arraycopy_id:
 2213       dest_uninitialized = false;
 2214       break;
 2215     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 2216       dest_uninitialized = true;
 2217       break;
 2218     default:
 2219       ShouldNotReachHere();
 2220     }
 2221 
    // The normal stub provides a 2nd entry which omits the frame push
    // for use when bailing out from a disjoint copy. Only some
    // checkcast stubs generate this 2nd entry.
 2225     int entry_count = StubInfo::entry_count(stub_id);
 2226     int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
 2227     GrowableArray<address> entries;
 2228     GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
 2229     assert(entry_count == expected_entry_count,
 2230            "expected entry count %d does not match declared entry count %d for stub %s",
 2231            expected_entry_count, entry_count, StubInfo::name(stub_id));
 2232     address start = load_archive_data(stub_id, entries_ptr);
 2233     if (start != nullptr) {
 2234       assert(entries.length() + 1 == expected_entry_count,
 2235              "expected entry count %d does not match return entry count %d for stub %s",
 2236              expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
 2237       if (nopush_entry != nullptr) {
 2238         *nopush_entry = entries.at(0);
 2239       }
 2240       return start;
 2241     }
 2242 
 2243     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 2244 
 2245     // Input registers (after setup_arg_regs)
 2246     const Register from        = c_rarg0;   // source array address
 2247     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
 2249     const Register ckoff       = c_rarg3;   // super_check_offset
 2250     const Register ckval       = c_rarg4;   // super_klass
 2251 
 2252     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 2253 
 2254     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 2255     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
 2257     const Register start_to    = r20;       // destination array start address
 2258     const Register r19_klass   = r19;       // oop._klass
 2259 
 2260     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 2261     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 2262 
 2263     //---------------------------------------------------------------
 2264     // Assembler stub will be used for this call to arraycopy
 2265     // if the two arrays are subtypes of Object[] but the
 2266     // destination array type is not equal to or a supertype
 2267     // of the source type.  Each element must be separately
 2268     // checked.
 2269 
 2270     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 2271                                copied_oop, r19_klass, count_save);
 2272 
 2273     __ align(CodeEntryAlignment);
 2274     StubCodeMark mark(this, stub_id);
 2275     start = __ pc();
 2276 
 2277     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2278 
 2279 #ifdef ASSERT
 2280     // caller guarantees that the arrays really are different
 2281     // otherwise, we would have to make conjoint checks
 2282     { Label L;
 2283       __ b(L);                  // conjoint check not yet implemented
 2284       __ stop("checkcast_copy within a single array");
 2285       __ bind(L);
 2286     }
 2287 #endif //ASSERT
 2288 
 2289     // Caller of this entry point must set up the argument registers.
 2290     if (nopush_entry != nullptr) {
 2291       *nopush_entry = __ pc();
 2292       entries.append(*nopush_entry);
 2293       BLOCK_COMMENT("Entry:");
 2294     }
 2295 
    // Empty array: Nothing to do.
 2297     __ cbz(count, L_done);
 2298     __ push(RegSet::of(r19, r20, r21, r22), sp);
 2299 
 2300 #ifdef ASSERT
 2301     BLOCK_COMMENT("assert consistent ckoff/ckval");
 2302     // The ckoff and ckval must be mutually consistent,
 2303     // even though caller generates both.
 2304     { Label L;
 2305       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2306       __ ldrw(start_to, Address(ckval, sco_offset));
 2307       __ cmpw(ckoff, start_to);
 2308       __ br(Assembler::EQ, L);
 2309       __ stop("super_check_offset inconsistent");
 2310       __ bind(L);
 2311     }
 2312 #endif //ASSERT
 2313 
 2314     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 2315     bool is_oop = true;
 2316     int element_size = UseCompressedOops ? 4 : 8;
 2317     if (dest_uninitialized) {
 2318       decorators |= IS_DEST_UNINITIALIZED;
 2319     }
 2320 
 2321     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 2322     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 2323 
 2324     // save the original count
 2325     __ mov(count_save, count);
 2326 
 2327     // Copy from low to high addresses
 2328     __ mov(start_to, to);              // Save destination array start address
 2329     __ b(L_load_element);
 2330 
 2331     // ======== begin loop ========
 2332     // (Loop is rotated; its entry is L_load_element.)
 2333     // Loop control:
 2334     //   for (; count != 0; count--) {
 2335     //     copied_oop = load_heap_oop(from++);
 2336     //     ... generate_type_check ...;
 2337     //     store_heap_oop(to++, copied_oop);
 2338     //   }
 2339     __ align(OptoLoopAlignment);
 2340 
 2341     __ BIND(L_store_element);
 2342     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 2343                       __ post(to, element_size), copied_oop, noreg,
 2344                       gct1, gct2, gct3);
 2345     __ sub(count, count, 1);
 2346     __ cbz(count, L_do_card_marks);
 2347 
 2348     // ======== loop entry is here ========
 2349     __ BIND(L_load_element);
 2350     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 2351                      copied_oop, noreg, __ post(from, element_size),
 2352                      gct1);
 2353     __ cbz(copied_oop, L_store_element);
 2354 
 2355     __ load_klass(r19_klass, copied_oop);// query the object klass
 2356 
 2357     BLOCK_COMMENT("type_check:");
 2358     generate_type_check(/*sub_klass*/r19_klass,
 2359                         /*super_check_offset*/ckoff,
 2360                         /*super_klass*/ckval,
 2361                         /*r_array_base*/gct1,
 2362                         /*temp2*/gct2,
 2363                         /*result*/r10, L_store_element);
 2364 
 2365     // Fall through on failure!
 2366 
 2367     // ======== end loop ========
 2368 
 2369     // It was a real error; we must depend on the caller to finish the job.
 2370     // Register count = remaining oops, count_orig = total oops.
 2371     // Emit GC store barriers for the oops we have copied and report
 2372     // their number to the caller.
 2373 
 2374     __ subs(count, count_save, count);     // K = partially copied oop count
 2375     __ eon(count, count, zr);              // report (-1^K) to caller
 2376     __ br(Assembler::EQ, L_done_pop);
 2377 
 2378     __ BIND(L_do_card_marks);
 2379     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
 2380 
 2381     __ bind(L_done_pop);
 2382     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2383     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2384 
 2385     __ bind(L_done);
 2386     __ mov(r0, count);
 2387     __ leave();
 2388     __ ret(lr);
 2389 
 2390     // record the stub entry and end plus any no_push entry
    store_archive_data(stub_id, start, __ pc(), entries_ptr);
 2392     return start;
 2393   }
 2394 
 2395   // Perform range checks on the proposed arraycopy.
 2396   // Kills temp, but nothing else.
 2397   // Also, clean the sign bits of src_pos and dst_pos.
 2398   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2399                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
 2401                               Register dst_pos, // destination position (c_rarg3)
 2402                               Register length,
 2403                               Register temp,
 2404                               Label& L_failed) {
 2405     BLOCK_COMMENT("arraycopy_range_checks:");
 2406 
 2407     assert_different_registers(rscratch1, temp);
 2408 
 2409     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2410     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2411     __ addw(temp, length, src_pos);
 2412     __ cmpw(temp, rscratch1);
 2413     __ br(Assembler::HI, L_failed);
 2414 
 2415     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2416     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2417     __ addw(temp, length, dst_pos);
 2418     __ cmpw(temp, rscratch1);
 2419     __ br(Assembler::HI, L_failed);
 2420 
 2421     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2422     __ movw(src_pos, src_pos);
 2423     __ movw(dst_pos, dst_pos);
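    // (a 32-bit register write zero-extends, clearing bits 63:32)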
 2424 
 2425     BLOCK_COMMENT("arraycopy_range_checks done");
 2426   }
 2427 
 2428   // These stubs get called from some dumb test routine.
 2429   // I'll write them properly when they're called from
 2430   // something that's actually doing something.
 2431   static void fake_arraycopy_stub(address src, address dst, int count) {
 2432     assert(count == 0, "huh?");
 2433   }
 2434 
 2435 
 2436   //
 2437   //  Generate 'unsafe' array copy stub
 2438   //  Though just as safe as the other stubs, it takes an unscaled
 2439   //  size_t argument instead of an element count.
 2440   //
 2441   //  Input:
 2442   //    c_rarg0   - source array address
 2443   //    c_rarg1   - destination array address
 2444   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2445   //
 2446   // Examines the alignment of the operands and dispatches
 2447   // to a long, int, short, or byte copy loop.
 2448   //
 2449   address generate_unsafe_copy(address byte_copy_entry,
 2450                                address short_copy_entry,
 2451                                address int_copy_entry,
 2452                                address long_copy_entry) {
 2453     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2454     int entry_count = StubInfo::entry_count(stub_id);
 2455     assert(entry_count == 1, "sanity check");
 2456     address start = load_archive_data(stub_id);
 2457     if (start != nullptr) {
 2458       return start;
 2459     }
 2460     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2461     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2462 
 2463     __ align(CodeEntryAlignment);
 2464     StubCodeMark mark(this, stub_id);
 2465     start = __ pc();
 2466     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2467 
 2468     // bump this on entry, not on exit:
 2469     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2470 
 2471     __ orr(rscratch1, s, d);
 2472     __ orr(rscratch1, rscratch1, count);
 2473 
 2474     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2475     __ cbz(rscratch1, L_long_aligned);
 2476     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2477     __ cbz(rscratch1, L_int_aligned);
 2478     __ tbz(rscratch1, 0, L_short_aligned);
 2479     __ b(RuntimeAddress(byte_copy_entry));
 2480 
 2481     __ BIND(L_short_aligned);
 2482     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2483     __ b(RuntimeAddress(short_copy_entry));
 2484     __ BIND(L_int_aligned);
 2485     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2486     __ b(RuntimeAddress(int_copy_entry));
 2487     __ BIND(L_long_aligned);
 2488     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2489     __ b(RuntimeAddress(long_copy_entry));
 2490 
 2491     // record the stub entry and end
 2492     store_archive_data(stub_id, start, __ pc());
 2493 
 2494     return start;
 2495   }
 2496 
 2497   //
 2498   //  Generate generic array copy stubs
 2499   //
 2500   //  Input:
 2501   //    c_rarg0    -  src oop
 2502   //    c_rarg1    -  src_pos (32-bits)
 2503   //    c_rarg2    -  dst oop
 2504   //    c_rarg3    -  dst_pos (32-bits)
 2505   //    c_rarg4    -  element count (32-bits)
 2506   //
 2507   //  Output:
 2508   //    r0 ==  0  -  success
 2509   //    r0 == -1^K - failure, where K is partial transfer count
 2510   //
 2511   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2512                                 address int_copy_entry, address oop_copy_entry,
 2513                                 address long_copy_entry, address checkcast_copy_entry) {
 2514     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2515     int entry_count = StubInfo::entry_count(stub_id);
 2516     assert(entry_count == 1, "sanity check");
 2517     address start = load_archive_data(stub_id);
 2518     if (start != nullptr) {
 2519       return start;
 2520     }
 2521     Label L_failed, L_objArray;
 2522     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2523 
 2524     // Input registers
 2525     const Register src        = c_rarg0;  // source array oop
 2526     const Register src_pos    = c_rarg1;  // source position
 2527     const Register dst        = c_rarg2;  // destination array oop
 2528     const Register dst_pos    = c_rarg3;  // destination position
 2529     const Register length     = c_rarg4;
 2530 
 2531 
 2532     // Registers used as temps
 2533     const Register dst_klass  = c_rarg5;
 2534 
 2535     __ align(CodeEntryAlignment);
 2536 
 2537     StubCodeMark mark(this, stub_id);
 2538 
 2539     start = __ pc();
 2540 
 2541     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2542 
 2543     // bump this on entry, not on exit:
 2544     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2545 
 2546     //-----------------------------------------------------------------------
 2547     // Assembler stub will be used for this call to arraycopy
 2548     // if the following conditions are met:
 2549     //
 2550     // (1) src and dst must not be null.
 2551     // (2) src_pos must not be negative.
 2552     // (3) dst_pos must not be negative.
 2553     // (4) length  must not be negative.
 2554     // (5) src klass and dst klass should be the same and not null.
 2555     // (6) src and dst should be arrays.
 2556     // (7) src_pos + length must not exceed length of src.
 2557     // (8) dst_pos + length must not exceed length of dst.
 2558     //
 2559 
 2560     //  if (src == nullptr) return -1;
 2561     __ cbz(src, L_failed);
 2562 
 2563     //  if (src_pos < 0) return -1;
 2564     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2565 
 2566     //  if (dst == nullptr) return -1;
 2567     __ cbz(dst, L_failed);
 2568 
 2569     //  if (dst_pos < 0) return -1;
 2570     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2571 
 2572     // registers used as temp
 2573     const Register scratch_length    = r16; // elements count to copy
 2574     const Register scratch_src_klass = r17; // array klass
 2575     const Register lh                = r15; // layout helper
 2576 
 2577     //  if (length < 0) return -1;
 2578     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2579     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2580 
 2581     __ load_klass(scratch_src_klass, src);
 2582 #ifdef ASSERT
 2583     //  assert(src->klass() != nullptr);
 2584     {
 2585       BLOCK_COMMENT("assert klasses not null {");
 2586       Label L1, L2;
 2587       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2588       __ bind(L1);
 2589       __ stop("broken null klass");
 2590       __ bind(L2);
 2591       __ load_klass(rscratch1, dst);
 2592       __ cbz(rscratch1, L1);     // this would be broken also
 2593       BLOCK_COMMENT("} assert klasses not null done");
 2594     }
 2595 #endif
 2596 
 2597     // Load layout helper (32-bits)
 2598     //
 2599     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2600     // 32        30    24            16              8     2                 0
 2601     //
 2602     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
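    //   e.g. (illustrative) an int[] has array_tag 0x3 and
    //   log2_element_size 2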
 2603     //
 2604 
 2605     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2606 
 2607     // Handle objArrays completely differently...
 2608     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2609     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2610     __ movw(rscratch1, objArray_lh);
 2611     __ eorw(rscratch2, lh, rscratch1);
 2612     __ cbzw(rscratch2, L_objArray);
 2613 
 2614     //  if (src->klass() != dst->klass()) return -1;
 2615     __ load_klass(rscratch2, dst);
 2616     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2617     __ cbnz(rscratch2, L_failed);
 2618 
 2619     // Check for flat inline type array -> return -1
 2620     __ test_flat_array_oop(src, rscratch2, L_failed);
 2621 
 2622     // Check for null-free (non-flat) inline type array -> handle as object array
 2623     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2624 
 2625     //  if (!src->is_Array()) return -1;
 2626     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2627 
 2628     // At this point, it is known to be a typeArray (array_tag 0x3).
 2629 #ifdef ASSERT
 2630     {
 2631       BLOCK_COMMENT("assert primitive array {");
 2632       Label L;
 2633       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2634       __ cmpw(lh, rscratch2);
 2635       __ br(Assembler::GE, L);
 2636       __ stop("must be a primitive array");
 2637       __ bind(L);
 2638       BLOCK_COMMENT("} assert primitive array done");
 2639     }
 2640 #endif
 2641 
 2642     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2643                            rscratch2, L_failed);
 2644 
 2645     // TypeArrayKlass
 2646     //
 2647     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2648     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2649     //
 2650 
 2651     const Register rscratch1_offset = rscratch1;    // array offset
 2652     const Register r15_elsize = lh; // element size
 2653 
 2654     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2655            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2656     __ add(src, src, rscratch1_offset);           // src array offset
 2657     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2658     BLOCK_COMMENT("choose copy loop based on element size");
 2659 
    // the following registers must be set before the jump to the corresponding stub
 2661     const Register from     = c_rarg0;  // source array address
 2662     const Register to       = c_rarg1;  // destination array address
 2663     const Register count    = c_rarg2;  // elements count
 2664 
    // 'from', 'to' and 'count' must be assigned in this order, since
    // they alias 'src', 'src_pos' and 'dst' respectively: each incoming
    // register is consumed before the output register that aliases it
    // is written.
 2667 
 2668     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2669 
 2670     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2671     // size in bytes).  We do a simple bitwise binary search.
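    // elsize: 0b00 -> byte, 0b01 -> short, 0b10 -> int, 0b11 -> long;
    // bit 1 splits {byte, short} from {int, long} and bit 0 selects
    // within each pair.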
 2672   __ BIND(L_copy_bytes);
 2673     __ tbnz(r15_elsize, 1, L_copy_ints);
 2674     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2675     __ lea(from, Address(src, src_pos));// src_addr
 2676     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2677     __ movw(count, scratch_length); // length
 2678     __ b(RuntimeAddress(byte_copy_entry));
 2679 
 2680   __ BIND(L_copy_shorts);
 2681     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2682     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2683     __ movw(count, scratch_length); // length
 2684     __ b(RuntimeAddress(short_copy_entry));
 2685 
 2686   __ BIND(L_copy_ints);
 2687     __ tbnz(r15_elsize, 0, L_copy_longs);
 2688     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2689     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2690     __ movw(count, scratch_length); // length
 2691     __ b(RuntimeAddress(int_copy_entry));
 2692 
 2693   __ BIND(L_copy_longs);
 2694 #ifdef ASSERT
 2695     {
 2696       BLOCK_COMMENT("assert long copy {");
 2697       Label L;
 2698       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2699       __ cmpw(r15_elsize, LogBytesPerLong);
 2700       __ br(Assembler::EQ, L);
 2701       __ stop("must be long copy, but elsize is wrong");
 2702       __ bind(L);
 2703       BLOCK_COMMENT("} assert long copy done");
 2704     }
 2705 #endif
 2706     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2707     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2708     __ movw(count, scratch_length); // length
 2709     __ b(RuntimeAddress(long_copy_entry));
 2710 
 2711     // ObjArrayKlass
 2712   __ BIND(L_objArray);
 2713     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2714 
 2715     Label L_plain_copy, L_checkcast_copy;
 2716     //  test array classes for subtyping
 2717     __ load_klass(r15, dst);
 2718     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2719     __ br(Assembler::NE, L_checkcast_copy);
 2720 
 2721     // Identically typed arrays can be copied without element-wise checks.
 2722     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2723                            rscratch2, L_failed);
 2724 
 2725     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2726     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2727     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2728     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2729     __ movw(count, scratch_length); // length
 2730   __ BIND(L_plain_copy);
 2731     __ b(RuntimeAddress(oop_copy_entry));
 2732 
 2733   __ BIND(L_checkcast_copy);
 2734     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2735     {
 2736       // Before looking at dst.length, make sure dst is also an objArray.
 2737       __ ldrw(rscratch1, Address(r15, lh_offset));
 2738       __ movw(rscratch2, objArray_lh);
 2739       __ eorw(rscratch1, rscratch1, rscratch2);
 2740       __ cbnzw(rscratch1, L_failed);
 2741 
 2742       // It is safe to examine both src.length and dst.length.
 2743       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2744                              r15, L_failed);
 2745 
 2746       __ load_klass(dst_klass, dst); // reload
 2747 
 2748       // Marshal the base address arguments now, freeing registers.
 2749       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2750       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2751       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2752       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2753       __ movw(count, length);           // length (reloaded)
 2754       Register sco_temp = c_rarg3;      // this register is free now
 2755       assert_different_registers(from, to, count, sco_temp,
 2756                                  dst_klass, scratch_src_klass);
 2757       // assert_clean_int(count, sco_temp);
 2758 
 2759       // Generate the type check.
 2760       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2761       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2762 
 2763       // Smashes rscratch1, rscratch2
 2764       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2765                           L_plain_copy);
 2766 
 2767       // Fetch destination element klass from the ObjArrayKlass header.
 2768       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2769       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2770       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2771 
 2772       // the checkcast_copy loop needs two extra arguments:
 2773       assert(c_rarg3 == sco_temp, "#3 already in place");
 2774       // Set up arguments for checkcast_copy_entry.
 2775       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2776       __ b(RuntimeAddress(checkcast_copy_entry));
 2777     }
 2778 
 2779   __ BIND(L_failed);
 2780     __ mov(r0, -1);
 2781     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2782     __ ret(lr);
 2783 
 2784     // record the stub entry and end
 2785     store_archive_data(stub_id, start, __ pc());
 2786 
 2787     return start;
 2788   }
 2789 
 2790   //
 2791   // Generate stub for array fill. If "aligned" is true, the
 2792   // "to" address is assumed to be heapword aligned.
 2793   //
 2794   // Arguments for generated stub:
 2795   //   to:    c_rarg0
 2796   //   value: c_rarg1
 2797   //   count: c_rarg2 treated as signed
 2798   //
 2799   address generate_fill(StubId stub_id) {
 2800     BasicType t;
 2801     bool aligned;
 2802 
 2803     switch (stub_id) {
 2804     case StubId::stubgen_jbyte_fill_id:
 2805       t = T_BYTE;
 2806       aligned = false;
 2807       break;
 2808     case StubId::stubgen_jshort_fill_id:
 2809       t = T_SHORT;
 2810       aligned = false;
 2811       break;
 2812     case StubId::stubgen_jint_fill_id:
 2813       t = T_INT;
 2814       aligned = false;
 2815       break;
 2816     case StubId::stubgen_arrayof_jbyte_fill_id:
 2817       t = T_BYTE;
 2818       aligned = true;
 2819       break;
 2820     case StubId::stubgen_arrayof_jshort_fill_id:
 2821       t = T_SHORT;
 2822       aligned = true;
 2823       break;
 2824     case StubId::stubgen_arrayof_jint_fill_id:
 2825       t = T_INT;
 2826       aligned = true;
 2827       break;
 2828     default:
 2829       ShouldNotReachHere();
 2830     };
 2831     int entry_count = StubInfo::entry_count(stub_id);
 2832     assert(entry_count == 1, "sanity check");
 2833     address start = load_archive_data(stub_id);
 2834     if (start != nullptr) {
 2835       return start;
 2836     }
 2837     __ align(CodeEntryAlignment);
 2838     StubCodeMark mark(this, stub_id);
 2839     start = __ pc();
 2840 
 2841     BLOCK_COMMENT("Entry:");
 2842 
    const Register to        = c_rarg0;  // destination array address
 2844     const Register value     = c_rarg1;  // value
 2845     const Register count     = c_rarg2;  // elements count
 2846 
 2847     const Register bz_base = r10;        // base for block_zero routine
 2848     const Register cnt_words = r11;      // temp register
 2849 
 2850     __ enter();
 2851 
    Label L_fill_elements;
 2853 
 2854     int shift = -1;
 2855     switch (t) {
 2856       case T_BYTE:
 2857         shift = 0;
 2858         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2859         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2860         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2861         __ br(Assembler::LO, L_fill_elements);
 2862         break;
 2863       case T_SHORT:
 2864         shift = 1;
 2865         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2866         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2867         __ br(Assembler::LO, L_fill_elements);
 2868         break;
 2869       case T_INT:
 2870         shift = 2;
 2871         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2872         __ br(Assembler::LO, L_fill_elements);
 2873         break;
 2874       default: ShouldNotReachHere();
 2875     }
 2876 
 2877     // Align source address at 8 bytes address boundary.
 2878     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2879     if (!aligned) {
 2880       switch (t) {
 2881         case T_BYTE:
 2882           // One byte misalignment happens only for byte arrays.
 2883           __ tbz(to, 0, L_skip_align1);
 2884           __ strb(value, Address(__ post(to, 1)));
 2885           __ subw(count, count, 1);
 2886           __ bind(L_skip_align1);
 2887           // Fallthrough
 2888         case T_SHORT:
 2889           // Two bytes misalignment happens only for byte and short (char) arrays.
 2890           __ tbz(to, 1, L_skip_align2);
 2891           __ strh(value, Address(__ post(to, 2)));
 2892           __ subw(count, count, 2 >> shift);
 2893           __ bind(L_skip_align2);
 2894           // Fallthrough
 2895         case T_INT:
 2896           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2897           __ tbz(to, 2, L_skip_align4);
 2898           __ strw(value, Address(__ post(to, 4)));
 2899           __ subw(count, count, 4 >> shift);
 2900           __ bind(L_skip_align4);
 2901           break;
 2902         default: ShouldNotReachHere();
 2903       }
 2904     }
 2905 
 2906     //
 2907     //  Fill large chunks
 2908     //
 2909     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2910     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2911     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2912     if (UseBlockZeroing) {
 2913       Label non_block_zeroing, rest;
 2914       // If the fill value is zero we can use the fast zero_words().
 2915       __ cbnz(value, non_block_zeroing);
 2916       __ mov(bz_base, to);
 2917       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2918       address tpc = __ zero_words(bz_base, cnt_words);
 2919       if (tpc == nullptr) {
 2920         fatal("CodeCache is full at generate_fill");
 2921       }
 2922       __ b(rest);
 2923       __ bind(non_block_zeroing);
 2924       __ fill_words(to, cnt_words, value);
 2925       __ bind(rest);
 2926     } else {
 2927       __ fill_words(to, cnt_words, value);
 2928     }
 2929 
 2930     // Remaining count is less than 8 bytes. Fill it by a single store.
 2931     // Note that the total length is no less than 8 bytes.
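    // e.g. for t == T_SHORT with 3 elements left, 'to' is advanced 6
    // bytes to the end and the str rewrites the final 8 bytes; the first
    // 2 of those were already filled by the loop above, so the overlap
    // is harmless.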
 2932     if (t == T_BYTE || t == T_SHORT) {
 2933       Label L_exit1;
 2934       __ cbzw(count, L_exit1);
 2935       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2936       __ str(value, Address(to, -8));    // overwrite some elements
 2937       __ bind(L_exit1);
 2938       __ leave();
 2939       __ ret(lr);
 2940     }
 2941 
    // Handle fills of less than 8 bytes.
 2943     Label L_fill_2, L_fill_4, L_exit2;
 2944     __ bind(L_fill_elements);
 2945     switch (t) {
 2946       case T_BYTE:
 2947         __ tbz(count, 0, L_fill_2);
 2948         __ strb(value, Address(__ post(to, 1)));
 2949         __ bind(L_fill_2);
 2950         __ tbz(count, 1, L_fill_4);
 2951         __ strh(value, Address(__ post(to, 2)));
 2952         __ bind(L_fill_4);
 2953         __ tbz(count, 2, L_exit2);
 2954         __ strw(value, Address(to));
 2955         break;
 2956       case T_SHORT:
 2957         __ tbz(count, 0, L_fill_4);
 2958         __ strh(value, Address(__ post(to, 2)));
 2959         __ bind(L_fill_4);
 2960         __ tbz(count, 1, L_exit2);
 2961         __ strw(value, Address(to));
 2962         break;
 2963       case T_INT:
 2964         __ cbzw(count, L_exit2);
 2965         __ strw(value, Address(to));
 2966         break;
 2967       default: ShouldNotReachHere();
 2968     }
 2969     __ bind(L_exit2);
 2970     __ leave();
 2971     __ ret(lr);
 2972 
 2973     // record the stub entry and end
 2974     store_archive_data(stub_id, start, __ pc());
 2975 
 2976     return start;
 2977   }
 2978 
 2979   address generate_unsafecopy_common_error_exit() {
 2980     StubId stub_id = StubId::stubgen_unsafecopy_common_id;
 2981     int entry_count = StubInfo::entry_count(stub_id);
 2982     assert(entry_count == 1, "sanity check");
 2983     address start = load_archive_data(stub_id);
 2984     if (start != nullptr) {
 2985       return start;
 2986     }
 2987     __ align(CodeEntryAlignment);
 2988     StubCodeMark mark(this, stub_id);
 2989     start = __ pc();
    __ leave();
    __ mov(r0, 0);
    __ ret(lr);
 2993 
 2994     // record the stub entry and end
 2995     store_archive_data(stub_id, start, __ pc());
 2996 
 2997     return start;
 2998   }
 2999 
 3000   //
 3001   //  Generate 'unsafe' set memory stub
 3002   //  Though just as safe as the other stubs, it takes an unscaled
 3003   //  size_t (# bytes) argument instead of an element count.
 3004   //
 3005   //  This fill operation is atomicity preserving: as long as the
 3006   //  address supplied is sufficiently aligned, all writes of up to 64
 3007   //  bits in size are single-copy atomic.
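  //  The tail code below descends through power-of-two store widths
  //  (32, 16, 8, 4, 2 and 1 bytes), so every naturally aligned unit of
  //  64 bits or less is covered by a single store instruction.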
 3008   //
 3009   //  Input:
 3010   //    c_rarg0   - destination array address
 3011   //    c_rarg1   - byte count (size_t)
 3012   //    c_rarg2   - byte value
 3013   //
 3014   address generate_unsafe_setmemory() {
 3015     StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
 3016     int entry_count = StubInfo::entry_count(stub_id);
 3017     assert(entry_count == 1, "sanity check");
    // we expect one set of extra UnsafeMemoryAccess handler entries
    GrowableArray<address> extras;
    int extra_count = 1 * UnsafeMemoryAccess::COLUMN_COUNT;
 3021     address start = load_archive_data(stub_id, nullptr, &extras);
 3022     if (start != nullptr) {
 3023       assert(extras.length() == extra_count,
 3024              "unexpected extra entry count %d", extras.length());
 3025       register_unsafe_access_handlers(extras, 0, 1);
 3026       return start;
 3027     }
 3028 
 3029     __ align(CodeEntryAlignment);
 3030     StubCodeMark mark(this, stub_id);
 3031     start = __ pc();
 3032 
 3033     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 3034     Label tail;
 3035 
 3036     {
 3037     UnsafeMemoryAccessMark umam(this, true, false);
 3038 
 3039     __ enter(); // required for proper stackwalking of RuntimeStub frame
 3040 
 3041     __ dup(v0, __ T16B, value);
 3042 
 3043     if (AvoidUnalignedAccesses) {
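      // With at least 16 bytes to fill, store one (possibly unaligned)
      // 16-byte chunk at the start and advance dest to the next 16-byte
      // boundary; the bytes past the boundary are rewritten with the
      // same value by the aligned code below, so the overlap is benign.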
 3044       __ cmp(count, (u1)16);
 3045       __ br(__ LO, tail);
 3046 
 3047       __ mov(rscratch1, 16);
 3048       __ andr(rscratch2, dest, 15);
 3049       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 3050       __ strq(v0, Address(dest));
 3051       __ sub(count, count, rscratch1);
 3052       __ add(dest, dest, rscratch1);
 3053     }
 3054 
 3055     __ subs(count, count, (u1)64);
 3056     __ br(__ LO, tail);
 3057     {
 3058       Label again;
 3059       __ bind(again);
 3060       __ stpq(v0, v0, Address(dest));
 3061       __ stpq(v0, v0, Address(dest, 32));
 3062 
 3063       __ subs(count, count, 64);
 3064       __ add(dest, dest, 64);
 3065       __ br(__ HS, again);
 3066     }
 3067 
 3068     __ bind(tail);
 3069     // The count of bytes is off by 64, but we don't need to correct
 3070     // it because we're only going to use the least-significant few
 3071     // count bits from here on.
 3072     // __ add(count, count, 64);
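    // e.g. an original count of 37 sits here as 37 - 64 = -27; in two's
    // complement the low six bits of -27 are still 0b100101 == 37, which
    // is all the tail tests below examine.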
 3073 
 3074     {
 3075       Label dont;
 3076       __ tbz(count, exact_log2(32), dont);
 3077       __ stpq(v0, v0, __ post(dest, 32));
 3078       __ bind(dont);
 3079     }
 3080     {
 3081       Label dont;
 3082       __ tbz(count, exact_log2(16), dont);
 3083       __ strq(v0, __ post(dest, 16));
 3084       __ bind(dont);
 3085     }
 3086     {
 3087       Label dont;
 3088       __ tbz(count, exact_log2(8), dont);
 3089       __ strd(v0, __ post(dest, 8));
 3090       __ bind(dont);
 3091     }
 3092 
 3093     Label finished;
 3094     __ tst(count, 7);
 3095     __ br(__ EQ, finished);
 3096 
 3097     {
 3098       Label dont;
 3099       __ tbz(count, exact_log2(4), dont);
 3100       __ strs(v0, __ post(dest, 4));
 3101       __ bind(dont);
 3102     }
 3103     {
 3104       Label dont;
 3105       __ tbz(count, exact_log2(2), dont);
 3106       __ bfi(value, value, 8, 8);
 3107       __ strh(value, __ post(dest, 2));
 3108       __ bind(dont);
 3109     }
 3110     {
 3111       Label dont;
 3112       __ tbz(count, exact_log2(1), dont);
 3113       __ strb(value, Address(dest));
 3114       __ bind(dont);
 3115     }
 3116 
 3117     __ bind(finished);
 3118     __ leave();
 3119     __ ret(lr);
 3120     // have to exit the block and destroy the UnsafeMemoryAccessMark
 3121     // in order to retrieve the handler end address
 3122     }
 3123 
 3124     // install saved handler addresses in extras
 3125     address end = __ pc();
 3126     retrieve_unsafe_access_handlers(start, end, extras);
 3127     assert(extras.length() == extra_count,
 3128            "incorrect handlers count %d", extras.length());
 3129     // record the stub entry and end plus the extras
 3130     store_archive_data(stub_id, start, end, nullptr, &extras);
 3131 
 3132     return start;
 3133   }
 3134 
 3135   address generate_data_cache_writeback() {
 3136     const Register line        = c_rarg0;  // address of line to write back
 3137 
 3138     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 3139     int entry_count = StubInfo::entry_count(stub_id);
 3140     assert(entry_count == 1, "sanity check");
 3141     address start = load_archive_data(stub_id);
 3142     if (start != nullptr) {
 3143       return start;
 3144     }
 3145     __ align(CodeEntryAlignment);
 3146     StubCodeMark mark(this, stub_id);
 3147 
 3148     start = __ pc();
 3149     __ enter();
 3150     __ cache_wb(Address(line, 0));
 3151     __ leave();
 3152     __ ret(lr);
 3153 
 3154     // record the stub entry and end
 3155     store_archive_data(stub_id, start, __ pc());
 3156 
 3157     return start;
 3158   }
 3159 
 3160   address generate_data_cache_writeback_sync() {
 3161     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 3162     int entry_count = StubInfo::entry_count(stub_id);
 3163     assert(entry_count == 1, "sanity check");
 3164     address start = load_archive_data(stub_id);
 3165     if (start != nullptr) {
 3166       return start;
 3167     }
 3168     const Register is_pre     = c_rarg0;  // pre or post sync
 3169     __ align(CodeEntryAlignment);
 3170     StubCodeMark mark(this, stub_id);
 3171 
    // pre wbsync is a no-op
    // post wbsync translates to a store fence
 3174 
 3175     Label skip;
 3176     start = __ pc();
 3177     __ enter();
 3178     __ cbnz(is_pre, skip);
 3179     __ cache_wbsync(false);
 3180     __ bind(skip);
 3181     __ leave();
 3182     __ ret(lr);
 3183 
 3184     // record the stub entry and end
 3185     store_archive_data(stub_id, start, __ pc());
 3186 
 3187     return start;
 3188   }
 3189 
 3190   void generate_arraycopy_stubs() {
    // Some copy stubs publish a normal entry and then a 2nd 'fallback'
    // entry immediately following their stack push. This can be used
    // as a post-push branch target for compatible stubs when they
    // identify a special case that can be handled by the fallback
    // stub, e.g. a disjoint copy stub may be used as a special-case
    // fallback for its compatible conjoint copy stub.
 3197     //
    // A no-push entry is always returned in the following local and
    // then published by assigning it to the appropriate entry field
    // in class StubRoutines. The entry value is then passed to the
    // generator for the compatible stub. That means the entry must be
    // listed when saving to/restoring from the AOT cache, ensuring
    // that the inter-stub jumps are noted at AOT-cache save and
    // relocated at AOT-cache load.
 3205     address nopush_entry;
 3206 
 3207     // generate the common exit first so later stubs can rely on it if
 3208     // they want an UnsafeMemoryAccess exit non-local to the stub
 3209     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 3210     // register the stub as the default exit with class UnsafeMemoryAccess
 3211     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 3212 
    // generate and publish aarch64-specific bulk copy routines first
    // so we can call them from other copy stubs
 3215     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3216     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3217 
 3218     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3219     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3220 
 3221     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 3222     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 3223 
 3224     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 3225 
 3226     //*** jbyte
 3227     // Always need aligned and unaligned versions
 3228     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 3229     // disjoint nopush entry is needed by conjoint copy
 3230     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 3231     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 3232     // conjoint nopush entry is needed by generic/unsafe copy
 3233     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 3234     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 3235     // disjoint arrayof nopush entry is needed by conjoint copy
 3236     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 3237     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 3238 
 3239     //*** jshort
 3240     // Always need aligned and unaligned versions
 3241     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 3242     // disjoint nopush entry is needed by conjoint copy
 3243     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 3244     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 3245     // conjoint nopush entry is used by generic/unsafe copy
 3246     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 3247     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 3248     // disjoint arrayof nopush entry is needed by conjoint copy
 3249     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 3250     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 3251 
 3252     //*** jint
 3253     // Aligned versions
 3254     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 3255     // disjoint arrayof nopush entry is needed by conjoint copy
 3256     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 3257     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
    // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
    // jint_arraycopy_nopush always points to the unaligned version.
 3260     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 3261     // disjoint nopush entry is needed by conjoint copy
 3262     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 3263     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 3264     // conjoint nopush entry is needed by generic/unsafe copy
 3265     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 3266 
 3267     //*** jlong
 3268     // It is always aligned
 3269     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 3270     // disjoint arrayof nopush entry is needed by conjoint copy
 3271     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 3272     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 3273     // conjoint nopush entry is needed by generic/unsafe copy
 3274     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 3275     // disjoint normal/nopush and conjoint normal entries are not
 3276     // generated since the arrayof versions are the same
 3277     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 3278     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 3279     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 3280 
 3281     //*** oops
 3282     {
 3283       StubRoutines::_arrayof_oop_disjoint_arraycopy
 3284         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 3285       // disjoint arrayof nopush entry is needed by conjoint copy
 3286       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 3287       StubRoutines::_arrayof_oop_arraycopy
 3288         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 3289       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 3290       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 3291       // Aligned versions without pre-barriers
 3292       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 3293         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 3294       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 3295       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 3296       // note that we don't need a returned nopush entry because the
 3297       // generic/unsafe copy does not cater for uninit arrays.
 3298       StubRoutines::_arrayof_oop_arraycopy_uninit
 3299         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 3300     }
 3301 
 3302     // for oop copies reuse arrayof entries for non-arrayof cases
 3303     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 3304     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 3305     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 3306     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 3307     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 3308     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 3309 
 3310     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 3311     // checkcast nopush entry is needed by generic copy
 3312     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 3313     // note that we don't need a returned nopush entry because the
 3314     // generic copy does not cater for uninit arrays.
 3315     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 3316 
 3317     // unsafe arraycopy may fallback on conjoint stubs
 3318     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 3319                                                               StubRoutines::_jshort_arraycopy_nopush,
 3320                                                               StubRoutines::_jint_arraycopy_nopush,
 3321                                                               StubRoutines::_jlong_arraycopy_nopush);
 3322 
 3323     // generic arraycopy may fallback on conjoint stubs
 3324     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 3325                                                                StubRoutines::_jshort_arraycopy_nopush,
 3326                                                                StubRoutines::_jint_arraycopy_nopush,
 3327                                                                StubRoutines::_oop_arraycopy_nopush,
 3328                                                                StubRoutines::_jlong_arraycopy_nopush,
 3329                                                                StubRoutines::_checkcast_arraycopy_nopush);
 3330 
 3331     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 3332     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 3333     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 3334     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 3335     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 3336     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 3337   }
 3338 
 3339   void generate_math_stubs() { Unimplemented(); }
 3340 
 3341   // Arguments:
 3342   //
 3343   // Inputs:
 3344   //   c_rarg0   - source byte array address
 3345   //   c_rarg1   - destination byte array address
 3346   //   c_rarg2   - sessionKe (key) in little endian int array
 3347   //
 3348   address generate_aescrypt_encryptBlock() {
 3349     assert(UseAES, "need AES cryptographic extension support");
 3350     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 3351     int entry_count = StubInfo::entry_count(stub_id);
 3352     assert(entry_count == 1, "sanity check");
 3353     address start = load_archive_data(stub_id);
 3354     if (start != nullptr) {
 3355       return start;
 3356     }
 3357     __ align(CodeEntryAlignment);
 3358     StubCodeMark mark(this, stub_id);
 3359 
 3360     const Register from        = c_rarg0;  // source array address
 3361     const Register to          = c_rarg1;  // destination array address
 3362     const Register key         = c_rarg2;  // key array address
 3363     const Register keylen      = rscratch1;
 3364 
 3365     start = __ pc();
 3366     __ enter();
 3367 
 3368     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3369 
 3370     __ aesenc_loadkeys(key, keylen);
 3371     __ aesecb_encrypt(from, to, keylen);
 3372 
 3373     __ mov(r0, 0);
 3374 
 3375     __ leave();
 3376     __ ret(lr);
 3377 
 3378     // record the stub entry and end
 3379     store_archive_data(stub_id, start, __ pc());
 3380 
 3381     return start;
 3382   }
 3383 
 3384   // Arguments:
 3385   //
 3386   // Inputs:
 3387   //   c_rarg0   - source byte array address
 3388   //   c_rarg1   - destination byte array address
 3389   //   c_rarg2   - sessionKd (key) in little endian int array
 3390   //
 3391   address generate_aescrypt_decryptBlock() {
 3392     assert(UseAES, "need AES cryptographic extension support");
 3393     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 3394     int entry_count = StubInfo::entry_count(stub_id);
 3395     assert(entry_count == 1, "sanity check");
 3396     address start = load_archive_data(stub_id);
 3397     if (start != nullptr) {
 3398       return start;
 3399     }
 3400     __ align(CodeEntryAlignment);
 3401     StubCodeMark mark(this, stub_id);
 3402     Label L_doLast;
 3403 
 3404     const Register from        = c_rarg0;  // source array address
 3405     const Register to          = c_rarg1;  // destination array address
 3406     const Register key         = c_rarg2;  // key array address
 3407     const Register keylen      = rscratch1;
 3408 
 3409     start = __ pc();
 3410     __ enter(); // required for proper stackwalking of RuntimeStub frame
 3411 
 3412     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3413 
 3414     __ aesecb_decrypt(from, to, key, keylen);
 3415 
 3416     __ mov(r0, 0);
 3417 
 3418     __ leave();
 3419     __ ret(lr);
 3420 
 3421     // record the stub entry and end
 3422     store_archive_data(stub_id, start, __ pc());
 3423 
 3424     return start;
 3425   }
 3426 
 3427   // Arguments:
 3428   //
 3429   // Inputs:
 3430   //   c_rarg0   - source byte array address
 3431   //   c_rarg1   - destination byte array address
 3432   //   c_rarg2   - sessionKe (key) in little endian int array
 3433   //   c_rarg3   - r vector byte array address
 3434   //   c_rarg4   - input length
 3435   //
 3436   // Output:
  //   r0        - input length
 3438   //
 3439   address generate_cipherBlockChaining_encryptAESCrypt() {
 3440     assert(UseAES, "need AES cryptographic extension support");
 3441     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 3442     int entry_count = StubInfo::entry_count(stub_id);
 3443     assert(entry_count == 1, "sanity check");
 3444     address start = load_archive_data(stub_id);
 3445     if (start != nullptr) {
 3446       return start;
 3447     }
 3448     __ align(CodeEntryAlignment);
 3449     StubCodeMark mark(this, stub_id);
 3450 
 3451     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3452 
 3453     const Register from        = c_rarg0;  // source array address
 3454     const Register to          = c_rarg1;  // destination array address
 3455     const Register key         = c_rarg2;  // key array address
 3456     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3457                                            // and left with the results of the last encryption block
 3458     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3459     const Register keylen      = rscratch1;
 3460 
 3461     start = __ pc();
 3462 
 3463       __ enter();
 3464 
 3465       __ movw(rscratch2, len_reg);
 3466 
 3467       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3468 
 3469       __ ld1(v0, __ T16B, rvec);
 3470 
 3471       __ cmpw(keylen, 52);
 3472       __ br(Assembler::CC, L_loadkeys_44);
 3473       __ br(Assembler::EQ, L_loadkeys_52);
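      // The expanded key length in ints is 4 * (rounds + 1): 44, 52 or
      // 60 for AES-128, AES-192 and AES-256 respectively, so the two
      // branches above skip the extra round keys that shorter keys do
      // not have.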
 3474 
 3475       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3476       __ rev32(v17, __ T16B, v17);
 3477       __ rev32(v18, __ T16B, v18);
 3478     __ BIND(L_loadkeys_52);
 3479       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3480       __ rev32(v19, __ T16B, v19);
 3481       __ rev32(v20, __ T16B, v20);
 3482     __ BIND(L_loadkeys_44);
 3483       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3484       __ rev32(v21, __ T16B, v21);
 3485       __ rev32(v22, __ T16B, v22);
 3486       __ rev32(v23, __ T16B, v23);
 3487       __ rev32(v24, __ T16B, v24);
 3488       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3489       __ rev32(v25, __ T16B, v25);
 3490       __ rev32(v26, __ T16B, v26);
 3491       __ rev32(v27, __ T16B, v27);
 3492       __ rev32(v28, __ T16B, v28);
 3493       __ ld1(v29, v30, v31, __ T16B, key);
 3494       __ rev32(v29, __ T16B, v29);
 3495       __ rev32(v30, __ T16B, v30);
 3496       __ rev32(v31, __ T16B, v31);
 3497 
 3498     __ BIND(L_aes_loop);
 3499       __ ld1(v1, __ T16B, __ post(from, 16));
 3500       __ eor(v0, __ T16B, v0, v1);
 3501 
 3502       __ br(Assembler::CC, L_rounds_44);
 3503       __ br(Assembler::EQ, L_rounds_52);
 3504 
 3505       __ aese(v0, v17); __ aesmc(v0, v0);
 3506       __ aese(v0, v18); __ aesmc(v0, v0);
 3507     __ BIND(L_rounds_52);
 3508       __ aese(v0, v19); __ aesmc(v0, v0);
 3509       __ aese(v0, v20); __ aesmc(v0, v0);
 3510     __ BIND(L_rounds_44);
 3511       __ aese(v0, v21); __ aesmc(v0, v0);
 3512       __ aese(v0, v22); __ aesmc(v0, v0);
 3513       __ aese(v0, v23); __ aesmc(v0, v0);
 3514       __ aese(v0, v24); __ aesmc(v0, v0);
 3515       __ aese(v0, v25); __ aesmc(v0, v0);
 3516       __ aese(v0, v26); __ aesmc(v0, v0);
 3517       __ aese(v0, v27); __ aesmc(v0, v0);
 3518       __ aese(v0, v28); __ aesmc(v0, v0);
 3519       __ aese(v0, v29); __ aesmc(v0, v0);
 3520       __ aese(v0, v30);
 3521       __ eor(v0, __ T16B, v0, v31);
 3522 
 3523       __ st1(v0, __ T16B, __ post(to, 16));
 3524 
 3525       __ subw(len_reg, len_reg, 16);
 3526       __ cbnzw(len_reg, L_aes_loop);
 3527 
 3528       __ st1(v0, __ T16B, rvec);
 3529 
 3530       __ mov(r0, rscratch2);
 3531 
 3532       __ leave();
 3533       __ ret(lr);
 3534 
    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
 3539   }
 3540 
 3541   // Arguments:
 3542   //
 3543   // Inputs:
 3544   //   c_rarg0   - source byte array address
 3545   //   c_rarg1   - destination byte array address
 3546   //   c_rarg2   - sessionKd (key) in little endian int array
 3547   //   c_rarg3   - r vector byte array address
 3548   //   c_rarg4   - input length
 3549   //
 3550   // Output:
 3551   //   r0        - input length
 3552   //
 3553   address generate_cipherBlockChaining_decryptAESCrypt() {
 3554     assert(UseAES, "need AES cryptographic extension support");
 3555     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3556     int entry_count = StubInfo::entry_count(stub_id);
 3557     assert(entry_count == 1, "sanity check");
 3558     address start = load_archive_data(stub_id);
 3559     if (start != nullptr) {
 3560       return start;
 3561     }
 3562     __ align(CodeEntryAlignment);
 3563     StubCodeMark mark(this, stub_id);
 3564 
 3565     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3566 
 3567     const Register from        = c_rarg0;  // source array address
 3568     const Register to          = c_rarg1;  // destination array address
 3569     const Register key         = c_rarg2;  // key array address
 3570     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3571                                            // and left with the results of the last encryption block
 3572     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3573     const Register keylen      = rscratch1;
 3574 
 3575     start = __ pc();
 3576 
 3577       __ enter();
 3578 
 3579       __ movw(rscratch2, len_reg);
 3580 
 3581       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3582 
 3583       __ ld1(v2, __ T16B, rvec);
 3584 
 3585       __ ld1(v31, __ T16B, __ post(key, 16));
 3586       __ rev32(v31, __ T16B, v31);
 3587 
 3588       __ cmpw(keylen, 52);
 3589       __ br(Assembler::CC, L_loadkeys_44);
 3590       __ br(Assembler::EQ, L_loadkeys_52);
 3591 
 3592       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3593       __ rev32(v17, __ T16B, v17);
 3594       __ rev32(v18, __ T16B, v18);
 3595     __ BIND(L_loadkeys_52);
 3596       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3597       __ rev32(v19, __ T16B, v19);
 3598       __ rev32(v20, __ T16B, v20);
 3599     __ BIND(L_loadkeys_44);
 3600       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3601       __ rev32(v21, __ T16B, v21);
 3602       __ rev32(v22, __ T16B, v22);
 3603       __ rev32(v23, __ T16B, v23);
 3604       __ rev32(v24, __ T16B, v24);
 3605       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3606       __ rev32(v25, __ T16B, v25);
 3607       __ rev32(v26, __ T16B, v26);
 3608       __ rev32(v27, __ T16B, v27);
 3609       __ rev32(v28, __ T16B, v28);
 3610       __ ld1(v29, v30, __ T16B, key);
 3611       __ rev32(v29, __ T16B, v29);
 3612       __ rev32(v30, __ T16B, v30);
 3613 
 3614     __ BIND(L_aes_loop);
 3615       __ ld1(v0, __ T16B, __ post(from, 16));
 3616       __ orr(v1, __ T16B, v0, v0);
 3617 
 3618       __ br(Assembler::CC, L_rounds_44);
 3619       __ br(Assembler::EQ, L_rounds_52);
 3620 
 3621       __ aesd(v0, v17); __ aesimc(v0, v0);
 3622       __ aesd(v0, v18); __ aesimc(v0, v0);
 3623     __ BIND(L_rounds_52);
 3624       __ aesd(v0, v19); __ aesimc(v0, v0);
 3625       __ aesd(v0, v20); __ aesimc(v0, v0);
 3626     __ BIND(L_rounds_44);
 3627       __ aesd(v0, v21); __ aesimc(v0, v0);
 3628       __ aesd(v0, v22); __ aesimc(v0, v0);
 3629       __ aesd(v0, v23); __ aesimc(v0, v0);
 3630       __ aesd(v0, v24); __ aesimc(v0, v0);
 3631       __ aesd(v0, v25); __ aesimc(v0, v0);
 3632       __ aesd(v0, v26); __ aesimc(v0, v0);
 3633       __ aesd(v0, v27); __ aesimc(v0, v0);
 3634       __ aesd(v0, v28); __ aesimc(v0, v0);
 3635       __ aesd(v0, v29); __ aesimc(v0, v0);
 3636       __ aesd(v0, v30);
 3637       __ eor(v0, __ T16B, v0, v31);
 3638       __ eor(v0, __ T16B, v0, v2);
 3639 
 3640       __ st1(v0, __ T16B, __ post(to, 16));
 3641       __ orr(v2, __ T16B, v1, v1);
 3642 
 3643       __ subw(len_reg, len_reg, 16);
 3644       __ cbnzw(len_reg, L_aes_loop);
 3645 
 3646       __ st1(v2, __ T16B, rvec);
 3647 
 3648       __ mov(r0, rscratch2);
 3649 
 3650       __ leave();
 3651       __ ret(lr);
 3652 
 3653     // record the stub entry and end
 3654     store_archive_data(stub_id, start, __ pc());
 3655 
 3656     return start;
 3657   }
 3658 
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in (128 bits) and inc (the 64-bit increment); both are
  // preserved. The least-significant 64-bit word sits in the upper
  // dword of each vector, and the lower dword of inc must be zero.
  // Output: result
 3664   void be_add_128_64(FloatRegister result, FloatRegister in,
 3665                      FloatRegister inc, FloatRegister tmp) {
 3666     assert_different_registers(result, tmp, inc);
 3667 
 3668     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3669                                            // input
 3670     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
 3671     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3672                                            // MSD == 0 (must be!) to LSD
 3673     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
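    // e.g. with the LSD lane at 0xffffffffffffffff and inc == 1: addv
    // wraps the LSD lane to 0, cm(HI) sets that lane to all-ones
    // (inc > result there), ext moves the mask into the MSD lane, and
    // subv subtracts -1, i.e. adds the carry into the MSD.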
 3674   }
 3675 
 3676   // CTR AES crypt.
 3677   // Arguments:
 3678   //
 3679   // Inputs:
 3680   //   c_rarg0   - source byte array address
 3681   //   c_rarg1   - destination byte array address
 3682   //   c_rarg2   - sessionKe (key) in little endian int array
 3683   //   c_rarg3   - counter vector byte array address
 3684   //   c_rarg4   - input length
 3685   //   c_rarg5   - saved encryptedCounter start
 3686   //   c_rarg6   - saved used length
 3687   //
 3688   // Output:
 3689   //   r0       - input length
 3690   //
 3691   address generate_counterMode_AESCrypt() {
 3692     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3693     int entry_count = StubInfo::entry_count(stub_id);
 3694     assert(entry_count == 1, "sanity check");
 3695     address start = load_archive_data(stub_id);
 3696     if (start != nullptr) {
 3697       return start;
 3698     }
 3699     const Register in = c_rarg0;
 3700     const Register out = c_rarg1;
 3701     const Register key = c_rarg2;
 3702     const Register counter = c_rarg3;
 3703     const Register saved_len = c_rarg4, len = r10;
 3704     const Register saved_encrypted_ctr = c_rarg5;
 3705     const Register used_ptr = c_rarg6, used = r12;
 3706 
 3707     const Register offset = r7;
 3708     const Register keylen = r11;
 3709 
 3710     const unsigned char block_size = 16;
 3711     const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.
 3718 
 3719     // Algorithm:
 3720     //
 3721     //    if (len == 0) {
 3722     //        goto DONE;
 3723     //    }
 3724     //    int result = len;
 3725     //    do {
 3726     //        if (used >= blockSize) {
 3727     //            if (len >= bulk_width * blockSize) {
 3728     //                CTR_large_block();
 3729     //                if (len == 0)
 3730     //                    goto DONE;
 3731     //            }
 3732     //            for (;;) {
 3733     //                16ByteVector v0 = counter;
 3734     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3735     //                used = 0;
 3736     //                if (len < blockSize)
 3737     //                    break;    /* goto NEXT */
 3738     //                16ByteVector v1 = load16Bytes(in, offset);
 3739     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
 3741     //                used = blockSize;
 3742     //                offset += blockSize;
 3743     //                len -= blockSize;
 3744     //                if (len == 0)
 3745     //                    goto DONE;
 3746     //            }
 3747     //        }
 3748     //      NEXT:
 3749     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3750     //        len--;
 3751     //    } while (len != 0);
 3752     //  DONE:
 3753     //    return result;
 3754     //
 3755     // CTR_large_block()
 3756     //    Wide bulk encryption of whole blocks.
 3757 
 3758     __ align(CodeEntryAlignment);
 3759     StubCodeMark mark(this, stub_id);
 3760     start = __ pc();
 3761     __ enter();
 3762 
 3763     Label DONE, CTR_large_block, large_block_return;
 3764     __ ldrw(used, Address(used_ptr));
 3765     __ cbzw(saved_len, DONE);
 3766 
 3767     __ mov(len, saved_len);
 3768     __ mov(offset, 0);
 3769 
 3770     // Compute #rounds for AES based on the length of the key array
 3771     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3772 
 3773     __ aesenc_loadkeys(key, keylen);
 3774 
 3775     {
 3776       Label L_CTR_loop, NEXT;
 3777 
 3778       __ bind(L_CTR_loop);
 3779 
 3780       __ cmp(used, block_size);
 3781       __ br(__ LO, NEXT);
 3782 
 3783       // Maybe we have a lot of data
 3784       __ subsw(rscratch1, len, bulk_width * block_size);
 3785       __ br(__ HS, CTR_large_block);
 3786       __ BIND(large_block_return);
 3787       __ cbzw(len, DONE);
 3788 
 3789       // Setup the counter
 3790       __ movi(v4, __ T4S, 0);
 3791       __ movi(v5, __ T4S, 1);
 3792       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3793 
 3794       // 128-bit big-endian increment
 3795       __ ld1(v0, __ T16B, counter);
 3796       __ rev64(v16, __ T16B, v0);
 3797       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3798       __ rev64(v16, __ T16B, v16);
 3799       __ st1(v16, __ T16B, counter);
 3800       // Previous counter value is in v0
 3801       // v4 contains { 0, 1 }
 3802 
 3803       {
 3804         // We have fewer than bulk_width blocks of data left. Encrypt
 3805         // them one by one until there is less than a full block
 3806         // remaining, being careful to save both the encrypted counter
 3807         // and the counter.
 3808 
 3809         Label inner_loop;
 3810         __ bind(inner_loop);
 3811         // Counter to encrypt is in v0
 3812         __ aesecb_encrypt(noreg, noreg, keylen);
 3813         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3814 
 3815         // Do we have a remaining full block?
 3816 
 3817         __ mov(used, 0);
 3818         __ cmp(len, block_size);
 3819         __ br(__ LO, NEXT);
 3820 
 3821         // Yes, we have a full block
 3822         __ ldrq(v1, Address(in, offset));
 3823         __ eor(v1, __ T16B, v1, v0);
 3824         __ strq(v1, Address(out, offset));
 3825         __ mov(used, block_size);
 3826         __ add(offset, offset, block_size);
 3827 
 3828         __ subw(len, len, block_size);
 3829         __ cbzw(len, DONE);
 3830 
 3831         // Increment the counter, store it back
 3832         __ orr(v0, __ T16B, v16, v16);
 3833         __ rev64(v16, __ T16B, v16);
 3834         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3835         __ rev64(v16, __ T16B, v16);
 3836         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3837 
 3838         __ b(inner_loop);
 3839       }
 3840 
 3841       __ BIND(NEXT);
 3842 
 3843       // Encrypt a single byte, and loop.
 3844       // We expect this to be a rare event.
 3845       __ ldrb(rscratch1, Address(in, offset));
 3846       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3847       __ eor(rscratch1, rscratch1, rscratch2);
 3848       __ strb(rscratch1, Address(out, offset));
 3849       __ add(offset, offset, 1);
 3850       __ add(used, used, 1);
      __ subw(len, len, 1);
 3852       __ cbnzw(len, L_CTR_loop);
 3853     }
 3854 
 3855     __ bind(DONE);
 3856     __ strw(used, Address(used_ptr));
 3857     __ mov(r0, saved_len);
 3858 
 3859     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3860     __ ret(lr);
 3861 
 3862     // Bulk encryption
 3863 
    __ BIND(CTR_large_block);
 3865     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3866 
 3867     if (bulk_width == 8) {
 3868       __ sub(sp, sp, 4 * 16);
 3869       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3870     }
 3871     __ sub(sp, sp, 4 * 16);
 3872     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3873     RegSet saved_regs = (RegSet::of(in, out, offset)
 3874                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3875     __ push(saved_regs, sp);
 3876     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3877     __ add(in, in, offset);
 3878     __ add(out, out, offset);
 3879 
 3880     // Keys should already be loaded into the correct registers
 3881 
 3882     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev64(v16, __ T16B, v0); // v16 holds the counter, byte-reversed within each 64-bit dword
 3884 
 3885     // AES/CTR loop
 3886     {
 3887       Label L_CTR_loop;
 3888       __ BIND(L_CTR_loop);
 3889 
 3890       // Setup the counters
 3891       __ movi(v8, __ T4S, 0);
 3892       __ movi(v9, __ T4S, 1);
 3893       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3894 
 3895       for (int i = 0; i < bulk_width; i++) {
 3896         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3897         __ rev64(v0_ofs, __ T16B, v16);
 3898         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3899       }
 3900 
 3901       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3902 
 3903       // Encrypt the counters
 3904       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3905 
 3906       if (bulk_width == 8) {
 3907         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3908       }
 3909 
 3910       // XOR the encrypted counters with the inputs
 3911       for (int i = 0; i < bulk_width; i++) {
 3912         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3913         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3914         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3915       }
 3916 
 3917       // Write the encrypted data
 3918       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3919       if (bulk_width == 8) {
 3920         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3921       }
 3922 
 3923       __ subw(len, len, 16 * bulk_width);
 3924       __ cbnzw(len, L_CTR_loop);
 3925     }
 3926 
 3927     // Save the counter back where it goes
 3928     __ rev64(v16, __ T16B, v16);
 3929     __ st1(v16, __ T16B, counter);
 3930 
 3931     __ pop(saved_regs, sp);
 3932 
 3933     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3934     if (bulk_width == 8) {
 3935       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3936     }
 3937 
 3938     __ andr(rscratch1, len, -16 * bulk_width);
 3939     __ sub(len, len, rscratch1);
 3940     __ add(offset, offset, rscratch1);
 3941     __ mov(used, 16);
 3942     __ strw(used, Address(used_ptr));
 3943     __ b(large_block_return);
 3944 
 3945     // record the stub entry and end
 3946     store_archive_data(stub_id, start, __ pc());
 3947 
 3948     return start;
 3949   }
 3950 
 3951   // Vector AES Galois Counter Mode implementation. Parameters:
 3952   //
 3953   // in = c_rarg0
 3954   // len = c_rarg1
 3955   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3956   // out = c_rarg3
 3957   // key = c_rarg4
 3958   // state = c_rarg5 - GHASH.state
 3959   // subkeyHtbl = c_rarg6 - powers of H
 3960   // counter = c_rarg7 - 16 bytes of CTR
 3961   // return - number of processed bytes
 3962   address generate_galoisCounterMode_AESCrypt() {
 3963     Label ghash_polynomial; // local data generated after code
 3964     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3965     int entry_count = StubInfo::entry_count(stub_id);
 3966     assert(entry_count == 1, "sanity check");
 3967     address start = load_archive_data(stub_id);
 3968     if (start != nullptr) {
 3969       return start;
 3970     }
 3971     __ align(CodeEntryAlignment);
 3972     StubCodeMark mark(this, stub_id);
 3973     start = __ pc();
 3974     __ enter();
 3975 
 3976     const Register in = c_rarg0;
 3977     const Register len = c_rarg1;
 3978     const Register ct = c_rarg2;
 3979     const Register out = c_rarg3;
 3981 
 3982     const Register key = c_rarg4;
 3983     const Register state = c_rarg5;
 3984 
 3985     const Register subkeyHtbl = c_rarg6;
 3986 
    const Register counter = c_rarg7;  // 16-byte counter block; updated with the incremented counter at the end
 3988 
 3989     const Register keylen = r10;
 3990     // Save state before entering routine
 3991     __ sub(sp, sp, 4 * 16);
 3992     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3993     __ sub(sp, sp, 4 * 16);
 3994     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3995 
 3997     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3998     __ str(len, __ pre(sp, -2 * wordSize));
 3999 
 4000     Label DONE;
 4001     __ cbz(len, DONE);
 4002 
 4003     // Compute #rounds for AES based on the length of the key array
 4004     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 4005 
 4006     __ aesenc_loadkeys(key, keylen);
 4007     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 4008     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 4009 
 4010     // AES/CTR loop
 4011     {
 4012       Label L_CTR_loop;
 4013       __ BIND(L_CTR_loop);
 4014 
 4015       // Setup the counters
 4016       __ movi(v8, __ T4S, 0);
 4017       __ movi(v9, __ T4S, 1);
 4018       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 4019 
      assert(v0->encoding() < v8->encoding(), "counter registers v0..v7 must precede v8");
 4021       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 4022         FloatRegister f = as_FloatRegister(i);
 4023         __ rev32(f, __ T16B, v16);
 4024         __ addv(v16, __ T4S, v16, v8);
 4025       }
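      // This is GCM's inc32 counter update (NIST SP 800-38D): only the
      // last four bytes of the counter block are incremented, modulo
      // 2^32, with no carry into the 96-bit IV part. The 4S lane-wise
      // add above cannot carry across lanes, which is exactly that
      // behaviour.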
 4026 
 4027       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 4028 
 4029       // Encrypt the counters
 4030       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 4031 
 4032       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 4033 
 4034       // XOR the encrypted counters with the inputs
 4035       for (int i = 0; i < 8; i++) {
 4036         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 4037         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 4038         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 4039       }
 4040       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 4041       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 4042 
 4043       __ subw(len, len, 16 * 8);
 4044       __ cbnzw(len, L_CTR_loop);
 4045     }
 4046 
 4047     __ rev32(v16, __ T16B, v16);
 4048     __ st1(v16, __ T16B, counter);
 4049 
 4050     __ ldr(len, Address(sp));
 4051     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 4052 
 4053     // GHASH/CTR loop
 4054     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 4055                                 len, /*unrolls*/4);
 4056 
 4057 #ifdef ASSERT
    { Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
    }
#endif

    __ bind(DONE);
 4067     // Return the number of bytes processed
 4068     __ ldr(r0, __ post(sp, 2 * wordSize));
 4069 
 4070     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 4071     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 4072 
 4073     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4074     __ ret(lr);
 4075 
 4076     // bind label and generate polynomial data
 4077     __ align(wordSize * 2);
 4078     __ bind(ghash_polynomial);
    __ emit_int64(0x87);  // The low-order bits of the GCM field
                          // polynomial z^128 + z^7 + z^2 + z + 1
                          // (i.e. 0x87 encodes z^7 + z^2 + z + 1),
                          // repeated in the low and high halves of a
                          // 128-bit vector
 4083     __ emit_int64(0x87);
 4084 
 4085     // record the stub entry and end
 4086     store_archive_data(stub_id, start, __ pc());
 4087 
 4088     return start;
 4089   }
 4090 
 4091   class Cached64Bytes {
 4092   private:
 4093     MacroAssembler *_masm;
 4094     Register _regs[8];
 4095 
 4096   public:
 4097     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers provided, but 8 are needed to cache 16 4-byte words", rs.size());
 4099       auto it = rs.begin();
 4100       for (auto &r: _regs) {
 4101         r = *it;
 4102         ++it;
 4103       }
 4104     }
 4105 
 4106     void gen_loads(Register base) {
 4107       for (int i = 0; i < 8; i += 2) {
 4108         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 4109       }
 4110     }
 4111 
 4112     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 4113     void extract_u32(Register dest, int i) {
 4114       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 4115     }
 4116   };
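
  // Typical use of Cached64Bytes: construct it with eight free
  // registers, call gen_loads() once per 64-byte block, then read
  // message words 0..15 via extract_u32. Word i lives in bits
  // [32*(i%2), 32*(i%2)+31] of _regs[i / 2].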
 4117 
 4118   // Utility routines for md5.
 4119   // Clobbers r10 and r11.
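  //
  // Reference formulas (RFC 1321): each helper computes, in 32-bit
  // arithmetic,
  //   r1 = r2 + rol(r1 + f(r2, r3, r4) + X[k] + t, s)
  // with rol(x, s) emitted as ror(x, 32 - s). The boolean functions are:
  //   FF: F(b,c,d) = (b & c) | (~b & d), computed as d ^ (b & (c ^ d));
  //   GG: G(b,c,d) = (b & d) | (c & ~d); the two terms are bitwise
  //       disjoint, so plain adds can stand in for the or;
  //   HH: H(b,c,d) = b ^ c ^ d;
  //   II: I(b,c,d) = c ^ (b | ~d).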
 4120   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4121               int k, int s, int t) {
 4122     Register rscratch3 = r10;
 4123     Register rscratch4 = r11;
 4124 
 4125     __ eorw(rscratch3, r3, r4);
 4126     __ movw(rscratch2, t);
 4127     __ andw(rscratch3, rscratch3, r2);
 4128     __ addw(rscratch4, r1, rscratch2);
 4129     reg_cache.extract_u32(rscratch1, k);
 4130     __ eorw(rscratch3, rscratch3, r4);
 4131     __ addw(rscratch4, rscratch4, rscratch1);
 4132     __ addw(rscratch3, rscratch3, rscratch4);
 4133     __ rorw(rscratch2, rscratch3, 32 - s);
 4134     __ addw(r1, rscratch2, r2);
 4135   }
 4136 
 4137   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4138               int k, int s, int t) {
 4139     Register rscratch3 = r10;
 4140     Register rscratch4 = r11;
 4141 
 4142     reg_cache.extract_u32(rscratch1, k);
 4143     __ movw(rscratch2, t);
 4144     __ addw(rscratch4, r1, rscratch2);
 4145     __ addw(rscratch4, rscratch4, rscratch1);
 4146     __ bicw(rscratch2, r3, r4);
 4147     __ andw(rscratch3, r2, r4);
 4148     __ addw(rscratch2, rscratch2, rscratch4);
 4149     __ addw(rscratch2, rscratch2, rscratch3);
 4150     __ rorw(rscratch2, rscratch2, 32 - s);
 4151     __ addw(r1, rscratch2, r2);
 4152   }
 4153 
 4154   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4155               int k, int s, int t) {
 4156     Register rscratch3 = r10;
 4157     Register rscratch4 = r11;
 4158 
 4159     __ eorw(rscratch3, r3, r4);
 4160     __ movw(rscratch2, t);
 4161     __ addw(rscratch4, r1, rscratch2);
 4162     reg_cache.extract_u32(rscratch1, k);
 4163     __ eorw(rscratch3, rscratch3, r2);
 4164     __ addw(rscratch4, rscratch4, rscratch1);
 4165     __ addw(rscratch3, rscratch3, rscratch4);
 4166     __ rorw(rscratch2, rscratch3, 32 - s);
 4167     __ addw(r1, rscratch2, r2);
 4168   }
 4169 
 4170   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4171               int k, int s, int t) {
 4172     Register rscratch3 = r10;
 4173     Register rscratch4 = r11;
 4174 
 4175     __ movw(rscratch3, t);
 4176     __ ornw(rscratch2, r2, r4);
 4177     __ addw(rscratch4, r1, rscratch3);
 4178     reg_cache.extract_u32(rscratch1, k);
 4179     __ eorw(rscratch3, rscratch2, r3);
 4180     __ addw(rscratch4, rscratch4, rscratch1);
 4181     __ addw(rscratch3, rscratch3, rscratch4);
 4182     __ rorw(rscratch2, rscratch3, 32 - s);
 4183     __ addw(r1, rscratch2, r2);
 4184   }
 4185 
 4186   // Arguments:
 4187   //
 4188   // Inputs:
 4189   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   MD5.state
 4191   //   c_rarg2   - int     offset
 4192   //   c_rarg3   - int     limit
 4193   //
 4194   address generate_md5_implCompress(StubId stub_id) {
 4195     bool multi_block;
 4196     switch (stub_id) {
 4197     case StubId::stubgen_md5_implCompress_id:
 4198       multi_block = false;
 4199       break;
 4200     case StubId::stubgen_md5_implCompressMB_id:
 4201       multi_block = true;
 4202       break;
 4203     default:
 4204       ShouldNotReachHere();
 4205     }
 4206     int entry_count = StubInfo::entry_count(stub_id);
 4207     assert(entry_count == 1, "sanity check");
 4208     address start = load_archive_data(stub_id);
 4209     if (start != nullptr) {
 4210       return start;
 4211     }
 4212     __ align(CodeEntryAlignment);
 4213 
 4214     StubCodeMark mark(this, stub_id);
 4215     start = __ pc();
 4216 
 4217     Register buf       = c_rarg0;
 4218     Register state     = c_rarg1;
 4219     Register ofs       = c_rarg2;
 4220     Register limit     = c_rarg3;
 4221     Register a         = r4;
 4222     Register b         = r5;
 4223     Register c         = r6;
 4224     Register d         = r7;
 4225     Register rscratch3 = r10;
 4226     Register rscratch4 = r11;
 4227 
 4228     Register state_regs[2] = { r12, r13 };
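    // state_regs cache the 16-byte MD5 state as two packed 64-bit
    // values: state_regs[0] = b:a and state_regs[1] = d:c, with the
    // lower-numbered word in the low half.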
 4229     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 4230     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 4231 
 4232     __ push(saved_regs, sp);
 4233 
 4234     __ ldp(state_regs[0], state_regs[1], Address(state));
 4235     __ ubfx(a, state_regs[0],  0, 32);
 4236     __ ubfx(b, state_regs[0], 32, 32);
 4237     __ ubfx(c, state_regs[1],  0, 32);
 4238     __ ubfx(d, state_regs[1], 32, 32);
 4239 
 4240     Label md5_loop;
 4241     __ BIND(md5_loop);
 4242 
 4243     reg_cache.gen_loads(buf);
 4244 
 4245     // Round 1
 4246     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 4247     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 4248     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 4249     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 4250     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 4251     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 4252     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 4253     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 4254     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 4255     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 4256     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 4257     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 4258     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 4259     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 4260     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 4261     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 4262 
 4263     // Round 2
 4264     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 4265     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 4266     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 4267     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 4268     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 4269     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 4270     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 4271     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 4272     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 4273     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 4274     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 4275     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 4276     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 4277     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 4278     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 4279     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 4280 
 4281     // Round 3
 4282     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 4283     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 4284     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 4285     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 4286     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 4287     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 4288     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 4289     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 4290     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 4291     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 4292     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 4293     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 4294     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 4295     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 4296     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 4297     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 4298 
 4299     // Round 4
 4300     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 4301     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 4302     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 4303     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 4304     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 4305     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 4306     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 4307     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 4308     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 4309     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 4310     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 4311     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 4312     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 4313     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 4314     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 4315     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 4316 
 4317     __ addw(a, state_regs[0], a);
 4318     __ ubfx(rscratch2, state_regs[0], 32, 32);
 4319     __ addw(b, rscratch2, b);
 4320     __ addw(c, state_regs[1], c);
 4321     __ ubfx(rscratch4, state_regs[1], 32, 32);
 4322     __ addw(d, rscratch4, d);
 4323 
 4324     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 4325     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 4326 
 4327     if (multi_block) {
 4328       __ add(buf, buf, 64);
 4329       __ add(ofs, ofs, 64);
 4330       __ cmp(ofs, limit);
 4331       __ br(Assembler::LE, md5_loop);
 4332       __ mov(c_rarg0, ofs); // return ofs
 4333     }
 4334 
 4335     // write hash values back in the correct order
 4336     __ stp(state_regs[0], state_regs[1], Address(state));
 4337 
 4338     __ pop(saved_regs, sp);
 4339 
 4340     __ ret(lr);
 4341 
 4342     // record the stub entry and end
 4343     store_archive_data(stub_id, start, __ pc());
 4344 
 4345     return start;
 4346   }
 4347 
 4348   // Arguments:
 4349   //
 4350   // Inputs:
 4351   //   c_rarg0   - byte[]  source+offset
 4352   //   c_rarg1   - int[]   SHA.state
 4353   //   c_rarg2   - int     offset
 4354   //   c_rarg3   - int     limit
 4355   //
 4356   address generate_sha1_implCompress(StubId stub_id) {
 4357     bool multi_block;
 4358     switch (stub_id) {
 4359     case StubId::stubgen_sha1_implCompress_id:
 4360       multi_block = false;
 4361       break;
 4362     case StubId::stubgen_sha1_implCompressMB_id:
 4363       multi_block = true;
 4364       break;
 4365     default:
 4366       ShouldNotReachHere();
 4367     }
 4368     int entry_count = StubInfo::entry_count(stub_id);
 4369     assert(entry_count == 1, "sanity check");
 4370     address start = load_archive_data(stub_id);
 4371     if (start != nullptr) {
 4372       return start;
 4373     }
 4374     __ align(CodeEntryAlignment);
 4375 
 4376     StubCodeMark mark(this, stub_id);
 4377     start = __ pc();
 4378 
 4379     Register buf   = c_rarg0;
 4380     Register state = c_rarg1;
 4381     Register ofs   = c_rarg2;
 4382     Register limit = c_rarg3;
 4383 
 4384     Label keys;
 4385     Label sha1_loop;
 4386 
 4387     // load the keys into v0..v3
 4388     __ adr(rscratch1, keys);
 4389     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word state into v6 (4 words) and v7 (1 word)
 4391     __ ldrq(v6, Address(state, 0));
 4392     __ ldrs(v7, Address(state, 16));
 4393 
 4394 
 4395     __ BIND(sha1_loop);
 4396     // load 64 bytes of data into v16..v19
 4397     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4398     __ rev32(v16, __ T16B, v16);
 4399     __ rev32(v17, __ T16B, v17);
 4400     __ rev32(v18, __ T16B, v18);
 4401     __ rev32(v19, __ T16B, v19);
 4402 
 4403     // do the sha1
 4404     __ addv(v4, __ T4S, v16, v0);
 4405     __ orr(v20, __ T16B, v6, v6);
 4406 
 4407     FloatRegister d0 = v16;
 4408     FloatRegister d1 = v17;
 4409     FloatRegister d2 = v18;
 4410     FloatRegister d3 = v19;
 4411 
 4412     for (int round = 0; round < 20; round++) {
 4413       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 4414       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 4415       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 4416       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 4417       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 4418 
 4419       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 4420       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 4421       __ sha1h(tmp2, __ T4S, v20);
 4422       if (round < 5)
 4423         __ sha1c(v20, __ T4S, tmp3, tmp4);
 4424       else if (round < 10 || round >= 15)
 4425         __ sha1p(v20, __ T4S, tmp3, tmp4);
 4426       else
 4427         __ sha1m(v20, __ T4S, tmp3, tmp4);
 4428       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 4429 
 4430       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4431     }
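    // Each loop iteration above retires four of SHA-1's 80 rounds, so
    // 20 iterations cover the whole schedule. sha1c/sha1p/sha1m supply
    // the round functions from FIPS 180-4:
    //   Ch(x,y,z)     = (x & y) ^ (~x & z)           rounds  0..19 (sha1c)
    //   Parity(x,y,z) = x ^ y ^ z                    rounds 20..39 and
    //                                                60..79 (sha1p)
    //   Maj(x,y,z)    = (x & y) ^ (x & z) ^ (y & z)  rounds 40..59 (sha1m)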
 4432 
 4433     __ addv(v7, __ T2S, v7, v21);
 4434     __ addv(v6, __ T4S, v6, v20);
 4435 
 4436     if (multi_block) {
 4437       __ add(ofs, ofs, 64);
 4438       __ cmp(ofs, limit);
 4439       __ br(Assembler::LE, sha1_loop);
 4440       __ mov(c_rarg0, ofs); // return ofs
 4441     }
 4442 
 4443     __ strq(v6, Address(state, 0));
 4444     __ strs(v7, Address(state, 16));
 4445 
 4446     __ ret(lr);
 4447 
 4448     __ bind(keys);
 4449     __ emit_int32(0x5a827999);
 4450     __ emit_int32(0x6ed9eba1);
 4451     __ emit_int32(0x8f1bbcdc);
 4452     __ emit_int32(0xca62c1d6);
 4453 
 4454     // record the stub entry and end
 4455     store_archive_data(stub_id, start, __ pc());
 4456 
 4457     return start;
 4458   }
 4459 
 4460 
 4461   // Arguments:
 4462   //
 4463   // Inputs:
 4464   //   c_rarg0   - byte[]  source+offset
 4465   //   c_rarg1   - int[]   SHA.state
 4466   //   c_rarg2   - int     offset
 4467   //   c_rarg3   - int     limit
 4468   //
 4469   address generate_sha256_implCompress(StubId stub_id) {
 4470     bool multi_block;
 4471     switch (stub_id) {
 4472     case StubId::stubgen_sha256_implCompress_id:
 4473       multi_block = false;
 4474       break;
 4475     case StubId::stubgen_sha256_implCompressMB_id:
 4476       multi_block = true;
 4477       break;
 4478     default:
 4479       ShouldNotReachHere();
 4480     }
 4481     int entry_count = StubInfo::entry_count(stub_id);
 4482     assert(entry_count == 1, "sanity check");
 4483     address start = load_archive_data(stub_id);
 4484     if (start != nullptr) {
 4485       return start;
 4486     }
 4487     __ align(CodeEntryAlignment);
 4488     StubCodeMark mark(this, stub_id);
 4489     start = __ pc();
 4490 
 4491     Register buf   = c_rarg0;
 4492     Register state = c_rarg1;
 4493     Register ofs   = c_rarg2;
 4494     Register limit = c_rarg3;
 4495 
    Label sha256_loop;
 4497 
 4498     __ stpd(v8, v9, __ pre(sp, -32));
 4499     __ stpd(v10, v11, Address(sp, 16));
 4500 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
 4508 
 4509     // load 16 keys to v16..v31
 4510     __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
 4511     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 4512     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 4513     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 4514     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 4515 
 4516     // load 8 words (256 bits) state
 4517     __ ldpq(v0, v1, state);
 4518 
    __ BIND(sha256_loop);
 4520     // load 64 bytes of data into v8..v11
 4521     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4522     __ rev32(v8, __ T16B, v8);
 4523     __ rev32(v9, __ T16B, v9);
 4524     __ rev32(v10, __ T16B, v10);
 4525     __ rev32(v11, __ T16B, v11);
 4526 
 4527     __ addv(v6, __ T4S, v8, v16);
 4528     __ orr(v2, __ T16B, v0, v0);
 4529     __ orr(v3, __ T16B, v1, v1);
 4530 
 4531     FloatRegister d0 = v8;
 4532     FloatRegister d1 = v9;
 4533     FloatRegister d2 = v10;
 4534     FloatRegister d3 = v11;
 4535 
 4536 
 4537     for (int round = 0; round < 16; round++) {
 4538       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4539       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4540       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4541       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4542 
 4543       if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
 4545       if (round < 15)
 4546         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4547       __ sha256h(v2, __ T4S, v3, tmp2);
 4548       __ sha256h2(v3, __ T4S, v4, tmp2);
 4549       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4550 
 4551       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4552     }
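    // Each iteration above retires four of SHA-256's 64 rounds: the
    // sha256h/sha256h2 pair consumes one vector of four message words
    // plus round constants, so the 16 iterations cover a whole block.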
 4553 
 4554     __ addv(v0, __ T4S, v0, v2);
 4555     __ addv(v1, __ T4S, v1, v3);
 4556 
 4557     if (multi_block) {
 4558       __ add(ofs, ofs, 64);
 4559       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
 4561       __ mov(c_rarg0, ofs); // return ofs
 4562     }
 4563 
 4564     __ ldpd(v10, v11, Address(sp, 16));
 4565     __ ldpd(v8, v9, __ post(sp, 32));
 4566 
 4567     __ stpq(v0, v1, state);
 4568 
 4569     __ ret(lr);
 4570 
 4571     // record the stub entry and end
 4572     store_archive_data(stub_id, start, __ pc());
 4573 
 4574     return start;
 4575   }
 4576 
  // Double rounds for sha512. Each call retires two of SHA-512's 80
  // rounds (one sha512h/sha512h2 pair), so the 40 calls per block cover
  // the full schedule. vrc0/vrc1 hold the current and next round-constant
  // vectors; vin0..vin4 carry the message schedule.
 4578   void sha512_dround(int dr,
 4579                      FloatRegister vi0, FloatRegister vi1,
 4580                      FloatRegister vi2, FloatRegister vi3,
 4581                      FloatRegister vi4, FloatRegister vrc0,
 4582                      FloatRegister vrc1, FloatRegister vin0,
 4583                      FloatRegister vin1, FloatRegister vin2,
 4584                      FloatRegister vin3, FloatRegister vin4) {
 4585       if (dr < 36) {
 4586         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4587       }
 4588       __ addv(v5, __ T2D, vrc0, vin0);
 4589       __ ext(v6, __ T16B, vi2, vi3, 8);
 4590       __ ext(v5, __ T16B, v5, v5, 8);
 4591       __ ext(v7, __ T16B, vi1, vi2, 8);
 4592       __ addv(vi3, __ T2D, vi3, v5);
 4593       if (dr < 32) {
 4594         __ ext(v5, __ T16B, vin3, vin4, 8);
 4595         __ sha512su0(vin0, __ T2D, vin1);
 4596       }
 4597       __ sha512h(vi3, __ T2D, v6, v7);
 4598       if (dr < 32) {
 4599         __ sha512su1(vin0, __ T2D, vin2, v5);
 4600       }
 4601       __ addv(vi4, __ T2D, vi1, vi3);
 4602       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4603   }
 4604 
 4605   // Arguments:
 4606   //
 4607   // Inputs:
 4608   //   c_rarg0   - byte[]  source+offset
 4609   //   c_rarg1   - int[]   SHA.state
 4610   //   c_rarg2   - int     offset
 4611   //   c_rarg3   - int     limit
 4612   //
 4613   address generate_sha512_implCompress(StubId stub_id) {
 4614     bool multi_block;
 4615     switch (stub_id) {
 4616     case StubId::stubgen_sha512_implCompress_id:
 4617       multi_block = false;
 4618       break;
 4619     case StubId::stubgen_sha512_implCompressMB_id:
 4620       multi_block = true;
 4621       break;
 4622     default:
 4623       ShouldNotReachHere();
 4624     }
 4625     int entry_count = StubInfo::entry_count(stub_id);
 4626     assert(entry_count == 1, "sanity check");
 4627     address start = load_archive_data(stub_id);
 4628     if (start != nullptr) {
 4629       return start;
 4630     }
 4631     __ align(CodeEntryAlignment);
 4632     StubCodeMark mark(this, stub_id);
 4633     start = __ pc();
 4634 
 4635     Register buf   = c_rarg0;
 4636     Register state = c_rarg1;
 4637     Register ofs   = c_rarg2;
 4638     Register limit = c_rarg3;
 4639 
 4640     __ stpd(v8, v9, __ pre(sp, -64));
 4641     __ stpd(v10, v11, Address(sp, 16));
 4642     __ stpd(v12, v13, Address(sp, 32));
 4643     __ stpd(v14, v15, Address(sp, 48));
 4644 
 4645     Label sha512_loop;
 4646 
 4647     // load state
 4648     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4649 
 4650     // load first 4 round constants
 4651     __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
 4652     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4653 
 4654     __ BIND(sha512_loop);
 4655     // load 128B of data into v12..v19
 4656     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4657     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4658     __ rev64(v12, __ T16B, v12);
 4659     __ rev64(v13, __ T16B, v13);
 4660     __ rev64(v14, __ T16B, v14);
 4661     __ rev64(v15, __ T16B, v15);
 4662     __ rev64(v16, __ T16B, v16);
 4663     __ rev64(v17, __ T16B, v17);
 4664     __ rev64(v18, __ T16B, v18);
 4665     __ rev64(v19, __ T16B, v19);
 4666 
 4667     __ mov(rscratch2, rscratch1);
 4668 
 4669     __ mov(v0, __ T16B, v8);
 4670     __ mov(v1, __ T16B, v9);
 4671     __ mov(v2, __ T16B, v10);
 4672     __ mov(v3, __ T16B, v11);
 4673 
 4674     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4675     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4676     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4677     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4678     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4679     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4680     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4681     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4682     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4683     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4684     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4685     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4686     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4687     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4688     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4689     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4690     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4691     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4692     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4693     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4694     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4695     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4696     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4697     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4698     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4699     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4700     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4701     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4702     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4703     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4704     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4705     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4706     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4707     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4708     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4709     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4710     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4711     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4712     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4713     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4714 
 4715     __ addv(v8, __ T2D, v8, v0);
 4716     __ addv(v9, __ T2D, v9, v1);
 4717     __ addv(v10, __ T2D, v10, v2);
 4718     __ addv(v11, __ T2D, v11, v3);
 4719 
 4720     if (multi_block) {
 4721       __ add(ofs, ofs, 128);
 4722       __ cmp(ofs, limit);
 4723       __ br(Assembler::LE, sha512_loop);
 4724       __ mov(c_rarg0, ofs); // return ofs
 4725     }
 4726 
 4727     __ st1(v8, v9, v10, v11, __ T2D, state);
 4728 
 4729     __ ldpd(v14, v15, Address(sp, 48));
 4730     __ ldpd(v12, v13, Address(sp, 32));
 4731     __ ldpd(v10, v11, Address(sp, 16));
 4732     __ ldpd(v8, v9, __ post(sp, 64));
 4733 
 4734     __ ret(lr);
 4735 
 4736     // record the stub entry and end
 4737     store_archive_data(stub_id, start, __ pc());
 4738 
 4739     return start;
 4740   }
 4741 
 4742   // Execute one round of keccak of two computations in parallel.
 4743   // One of the states should be loaded into the lower halves of
 4744   // the vector registers v0-v24, the other should be loaded into
 4745   // the upper halves of those registers. The ld1r instruction loads
 4746   // the round constant into both halves of register v31.
 4747   // Intermediate results c0...c5 and d0...d5 are computed
 4748   // in registers v25...v30.
 4749   // All vector instructions that are used operate on both register
 4750   // halves in parallel.
  // If only a single computation is needed, one can load only the lower halves.
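  //
  // Per round, the usual Keccak-f[1600] steps appear in order:
  //   theta:  column parities c0..c4 (the eor3 group), then
  //           d[i] = c[i-1] ^ rol(c[i+1], 1) (the rax1 group);
  //   rho/pi: the lane rotations and permutation, fused into the xar
  //           instructions;
  //   chi:    a[x] ^= ~a[x+1] & a[x+2] along each row (the bcax groups);
  //   iota:   a0 ^= round constant (the final eor with v31).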
 4752   void keccak_round(Register rscratch1) {
 4753   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4756   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4757   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4758   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4759   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4760   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4761   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4762   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4763 
 4764   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4765   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4766   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4767   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4768   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4769 
 4770   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4771   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4773   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4774   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4775   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4776   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4777   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4778   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4779   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4780   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4781   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4782   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4783   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4784   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4785   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4786   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4787   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4788   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4789   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4790   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4791   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4792   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4793   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4794   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4795 
 4796   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4797   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4798   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4799   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4800   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4801 
 4802   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4803 
 4804   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4805   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4806   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4807   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4808   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4809 
 4810   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4811   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4812   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4813   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4814   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4815 
 4816   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4817   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4818   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4819   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4820   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4821 
 4822   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4823   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4824   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4825   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4826   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4827 
 4828   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4829   }
 4830 
 4831   // Arguments:
 4832   //
 4833   // Inputs:
 4834   //   c_rarg0   - byte[]  source+offset
 4835   //   c_rarg1   - byte[]  SHA.state
 4836   //   c_rarg2   - int     block_size
 4837   //   c_rarg3   - int     offset
 4838   //   c_rarg4   - int     limit
 4839   //
 4840   address generate_sha3_implCompress(StubId stub_id) {
 4841     bool multi_block;
 4842     switch (stub_id) {
 4843     case StubId::stubgen_sha3_implCompress_id:
 4844       multi_block = false;
 4845       break;
 4846     case StubId::stubgen_sha3_implCompressMB_id:
 4847       multi_block = true;
 4848       break;
 4849     default:
 4850       ShouldNotReachHere();
 4851     }
 4852     int entry_count = StubInfo::entry_count(stub_id);
 4853     assert(entry_count == 1, "sanity check");
 4854     address start = load_archive_data(stub_id);
 4855     if (start != nullptr) {
 4856       return start;
 4857     }
 4858     __ align(CodeEntryAlignment);
 4859     StubCodeMark mark(this, stub_id);
 4860     start = __ pc();
 4861 
 4862     Register buf           = c_rarg0;
 4863     Register state         = c_rarg1;
 4864     Register block_size    = c_rarg2;
 4865     Register ofs           = c_rarg3;
 4866     Register limit         = c_rarg4;
 4867 
 4868     Label sha3_loop, rounds24_loop;
 4869     Label sha3_512_or_sha3_384, shake128;
 4870 
 4871     __ stpd(v8, v9, __ pre(sp, -64));
 4872     __ stpd(v10, v11, Address(sp, 16));
 4873     __ stpd(v12, v13, Address(sp, 32));
 4874     __ stpd(v14, v15, Address(sp, 48));
 4875 
 4876     // load state
 4877     __ add(rscratch1, state, 32);
 4878     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4879     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4880     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4881     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4882     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4883     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4884     __ ld1(v24, __ T1D, rscratch1);
 4885 
 4886     __ BIND(sha3_loop);
 4887 
 4888     // 24 keccak rounds
 4889     __ movw(rscratch2, 24);
 4890 
 4891     // load round_constants base
 4892     __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
 4893 
 4894     // load input
 4895     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4896     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4897     __ eor(v0, __ T8B, v0, v25);
 4898     __ eor(v1, __ T8B, v1, v26);
 4899     __ eor(v2, __ T8B, v2, v27);
 4900     __ eor(v3, __ T8B, v3, v28);
 4901     __ eor(v4, __ T8B, v4, v29);
 4902     __ eor(v5, __ T8B, v5, v30);
 4903     __ eor(v6, __ T8B, v6, v31);
 4904 
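    // Absorb the rest of the rate, which depends on the variant.
    // Rates (block_size, in bytes): 72 -> SHA3-512, 104 -> SHA3-384,
    // 136 -> SHA3-256/SHAKE256, 144 -> SHA3-224, 168 -> SHAKE128.
    // The bit tests below tell these apart without extra compares.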
 4905     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4906     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4907 
 4908     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4909     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4910     __ eor(v7, __ T8B, v7, v25);
 4911     __ eor(v8, __ T8B, v8, v26);
 4912     __ eor(v9, __ T8B, v9, v27);
 4913     __ eor(v10, __ T8B, v10, v28);
 4914     __ eor(v11, __ T8B, v11, v29);
 4915     __ eor(v12, __ T8B, v12, v30);
 4916     __ eor(v13, __ T8B, v13, v31);
 4917 
 4918     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4919     __ eor(v14, __ T8B, v14, v25);
 4920     __ eor(v15, __ T8B, v15, v26);
 4921     __ eor(v16, __ T8B, v16, v27);
 4922 
 4923     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4924     __ andw(c_rarg5, block_size, 48);
 4925     __ cbzw(c_rarg5, rounds24_loop);
 4926 
 4927     __ tbnz(block_size, 5, shake128);
 4928     // block_size == 144, bit5 == 0, SHA3-224
 4929     __ ldrd(v28, __ post(buf, 8));
 4930     __ eor(v17, __ T8B, v17, v28);
 4931     __ b(rounds24_loop);
 4932 
 4933     __ BIND(shake128);
 4934     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4935     __ eor(v17, __ T8B, v17, v28);
 4936     __ eor(v18, __ T8B, v18, v29);
 4937     __ eor(v19, __ T8B, v19, v30);
 4938     __ eor(v20, __ T8B, v20, v31);
 4939     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4940 
 4941     __ BIND(sha3_512_or_sha3_384);
 4942     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4943     __ eor(v7, __ T8B, v7, v25);
 4944     __ eor(v8, __ T8B, v8, v26);
 4945     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4946 
 4947     // SHA3-384
 4948     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4949     __ eor(v9,  __ T8B, v9,  v27);
 4950     __ eor(v10, __ T8B, v10, v28);
 4951     __ eor(v11, __ T8B, v11, v29);
 4952     __ eor(v12, __ T8B, v12, v30);
 4953 
 4954     __ BIND(rounds24_loop);
 4955     __ subw(rscratch2, rscratch2, 1);
 4956 
 4957     keccak_round(rscratch1);
 4958 
 4959     __ cbnzw(rscratch2, rounds24_loop);
 4960 
 4961     if (multi_block) {
 4962       __ add(ofs, ofs, block_size);
 4963       __ cmp(ofs, limit);
 4964       __ br(Assembler::LE, sha3_loop);
 4965       __ mov(c_rarg0, ofs); // return ofs
 4966     }
 4967 
 4968     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4969     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4970     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4971     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4972     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4973     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4974     __ st1(v24, __ T1D, state);
 4975 
 4976     // restore callee-saved registers
 4977     __ ldpd(v14, v15, Address(sp, 48));
 4978     __ ldpd(v12, v13, Address(sp, 32));
 4979     __ ldpd(v10, v11, Address(sp, 16));
 4980     __ ldpd(v8, v9, __ post(sp, 64));
 4981 
 4982     __ ret(lr);
 4983 
 4984     // record the stub entry and end
 4985     store_archive_data(stub_id, start, __ pc());
 4986 
 4987     return start;
 4988   }
 4989 
 4990   // Inputs:
 4991   //   c_rarg0   - long[]  state0
 4992   //   c_rarg1   - long[]  state1
 4993   address generate_double_keccak() {
 4994     StubId stub_id = StubId::stubgen_double_keccak_id;
 4995     int entry_count = StubInfo::entry_count(stub_id);
 4996     assert(entry_count == 1, "sanity check");
 4997     address start = load_archive_data(stub_id);
 4998     if (start != nullptr) {
 4999       return start;
 5000     }
    // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
 5003     __ align(CodeEntryAlignment);
 5004     StubCodeMark mark(this, stub_id);
 5005     start = __ pc();
 5006     __ enter();
 5007 
 5008     Register state0        = c_rarg0;
 5009     Register state1        = c_rarg1;
 5010 
 5011     Label rounds24_loop;
 5012 
 5013     // save callee-saved registers
 5014     __ stpd(v8, v9, __ pre(sp, -64));
 5015     __ stpd(v10, v11, Address(sp, 16));
 5016     __ stpd(v12, v13, Address(sp, 32));
 5017     __ stpd(v14, v15, Address(sp, 48));
 5018 
 5019     // load states
 5020     __ add(rscratch1, state0, 32);
 5021     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 5022     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 5023     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 5024     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 5025     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 5026     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 5027     __ ld1(v24, __ D, 0, rscratch1);
 5028     __ add(rscratch1, state1, 32);
 5029     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 5030     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 5031     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 5032     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 5033     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 5034     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 5035     __ ld1(v24, __ D, 1, rscratch1);
 5036 
 5037     // 24 keccak rounds
 5038     __ movw(rscratch2, 24);
 5039 
 5040     // load round_constants base
 5041     __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
 5042 
 5043     __ BIND(rounds24_loop);
 5044     __ subw(rscratch2, rscratch2, 1);
 5045     keccak_round(rscratch1);
 5046     __ cbnzw(rscratch2, rounds24_loop);
 5047 
 5048     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 5049     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 5050     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 5051     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 5052     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 5053     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 5054     __ st1(v24, __ D, 0, state0);
 5055     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 5056     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 5057     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 5058     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 5059     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 5060     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 5061     __ st1(v24, __ D, 1, state1);
 5062 
 5063     // restore callee-saved vector registers
 5064     __ ldpd(v14, v15, Address(sp, 48));
 5065     __ ldpd(v12, v13, Address(sp, 32));
 5066     __ ldpd(v10, v11, Address(sp, 16));
 5067     __ ldpd(v8, v9, __ post(sp, 64));
 5068 
 5069     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5070     __ mov(r0, zr); // return 0
 5071     __ ret(lr);
 5072 
 5073     // record the stub entry and end
 5074     store_archive_data(stub_id, start, __ pc());
 5075 
 5076     return start;
 5077   }
 5078 
 5079   // ChaCha20 block function.  This version parallelizes the 32-bit
 5080   // state elements on each of 16 vectors, producing 4 blocks of
 5081   // keystream at a time.
 5082   //
 5083   // state (int[16]) = c_rarg0
 5084   // keystream (byte[256]) = c_rarg1
 5085   // return - number of bytes of produced keystream (always 256)
 5086   //
 5087   // This implementation takes each 32-bit integer from the state
 5088   // array and broadcasts it across all 4 32-bit lanes of a vector register
 5089   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 5090   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 5091   // the quarter round schedule is implemented as outlined in RFC 7539 section
 5092   // 2.3.  However, instead of sequentially processing the 3 quarter round
 5093   // operations represented by one QUARTERROUND function, we instead stack all
 5094   // the adds, xors and left-rotations from the first 4 quarter rounds together
 5095   // and then do the same for the second set of 4 quarter rounds.  This removes
 5096   // some latency that would otherwise be incurred by waiting for an add to
 5097   // complete before performing an xor (which depends on the result of the
 5098   // add), etc. An adjustment happens between the first and second groups of 4
 5099   // quarter rounds, but this is done only in the inputs to the macro functions
 5100   // that generate the assembly instructions - these adjustments themselves are
 5101   // not part of the resulting assembly.
 5102   // The 4 registers v0-v3 are used during the quarter round operations as
 5103   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 5104   // registers become the vectors involved in adding the start state back onto
 5105   // the post-QR working state.  After the adds are complete, each of the 16
 5106   // vectors write their first lane back to the keystream buffer, followed
 5107   // by the second lane from all vectors and so on.
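  //
  // For reference, one scalar quarter round (RFC 7539 section 2.1):
  //   a += b; d ^= a; d = rol(d, 16);
  //   c += d; b ^= c; b = rol(b, 12);
  //   a += b; d ^= a; d = rol(d,  8);
  //   c += d; b ^= c; b = rol(b,  7);
  // Each cc20_qr_* bundle below applies one of these steps to four
  // quarter rounds at once, and the 10 trips through the column-plus-
  // diagonal loop give ChaCha20 its 20 rounds.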
 5108   address generate_chacha20Block_blockpar() {
 5109     StubId stub_id = StubId::stubgen_chacha20Block_id;
 5110     int entry_count = StubInfo::entry_count(stub_id);
 5111     assert(entry_count == 1, "sanity check");
 5112     address start = load_archive_data(stub_id);
 5113     if (start != nullptr) {
 5114       return start;
 5115     }
 5116     Label L_twoRounds, L_cc20_const;
 5117     __ align(CodeEntryAlignment);
 5118     StubCodeMark mark(this, stub_id);
 5119     start = __ pc();
 5120     __ enter();
 5121 
 5122     int i, j;
 5123     const Register state = c_rarg0;
 5124     const Register keystream = c_rarg1;
 5125     const Register loopCtr = r10;
 5126     const Register tmpAddr = r11;
 5127     const FloatRegister ctrAddOverlay = v28;
 5128     const FloatRegister lrot8Tbl = v29;
 5129 
 5130     // Organize SIMD registers in an array that facilitates
 5131     // putting repetitive opcodes into loop structures.  It is
 5132     // important that each grouping of 4 registers is monotonically
 5133     // increasing to support the requirements of multi-register
 5134     // instructions (e.g. ld4r, st4, etc.)
 5135     const FloatRegister workSt[16] = {
 5136          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 5137         v20, v21, v22, v23, v24, v25, v26, v27
 5138     };
 5139 
 5140     // Pull in constant data.  The first 16 bytes are the add overlay
 5141     // which is applied to the vector holding the counter (state[12]).
 5142     // The second 16 bytes is the index register for the 8-bit left
 5143     // rotation tbl instruction.
 5144     __ adr(tmpAddr, L_cc20_const);
 5145     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 5146 
    // Load from memory and interlace across 16 SIMD registers, with
    // each word from memory broadcast to all lanes of each successive
    // SIMD register:
    //      Addr(0) -> All lanes of workSt[i]
    //      Addr(4) -> All lanes of workSt[i + 1], etc.
 5152     __ mov(tmpAddr, state);
 5153     for (i = 0; i < 16; i += 4) {
 5154       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 5155           __ post(tmpAddr, 16));
 5156     }
 5157     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5158 
 5159     // Before entering the loop, create 5 4-register arrays.  These
 5160     // will hold the 4 registers that represent the a/b/c/d fields
 5161     // in the quarter round operation.  For instance the "b" field
 5162     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 5163     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 5164     // since it is part of a diagonal organization.  The aSet and scratch
 5165     // register sets are defined at declaration time because they do not change
 5166     // organization at any point during the 20-round processing.
 5167     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 5168     FloatRegister bSet[4];
 5169     FloatRegister cSet[4];
 5170     FloatRegister dSet[4];
 5171     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 5172 
 5173     // Set up the 10 iteration loop and perform all 8 quarter round ops
 5174     __ mov(loopCtr, 10);
 5175     __ BIND(L_twoRounds);
 5176 
 5177     // Set to columnar organization and do the following 4 quarter-rounds:
 5178     // QUARTERROUND(0, 4, 8, 12)
 5179     // QUARTERROUND(1, 5, 9, 13)
 5180     // QUARTERROUND(2, 6, 10, 14)
 5181     // QUARTERROUND(3, 7, 11, 15)
 5182     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 5183     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 5184     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 5185 
 5186     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5187     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5188     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5189 
 5190     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5191     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5192     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5193 
 5194     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5195     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5196     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5197 
 5198     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5199     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 5201 
 5202     // Set to diagonal organization and do the next 4 quarter-rounds:
 5203     // QUARTERROUND(0, 5, 10, 15)
 5204     // QUARTERROUND(1, 6, 11, 12)
 5205     // QUARTERROUND(2, 7, 8, 13)
 5206     // QUARTERROUND(3, 4, 9, 14)
 5207     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 5208     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 5209     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 5210 
 5211     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5212     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5213     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5214 
 5215     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5216     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5217     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5218 
 5219     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5220     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5221     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5222 
 5223     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5224     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 5226 
 5227     // Decrement and iterate
 5228     __ sub(loopCtr, loopCtr, 1);
 5229     __ cbnz(loopCtr, L_twoRounds);
 5230 
 5231     __ mov(tmpAddr, state);
 5232 
 5233     // Add the starting state back to the post-loop keystream
 5234     // state.  We read/interlace the state array from memory into
 5235     // 4 registers similar to what we did in the beginning.  Then
 5236     // add the counter overlay onto workSt[12] at the end.
 5237     for (i = 0; i < 16; i += 4) {
 5238       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 5239       __ addv(workSt[i], __ T4S, workSt[i], v0);
 5240       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 5241       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 5242       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 5243     }
 5244     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5245 
 5246     // Write working state into the keystream buffer.  This is accomplished
 5247     // by taking the lane "i" from each of the four vectors and writing
 5248     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 5249     // repeating with the next 4 vectors until all 16 vectors have been used.
 5250     // Then move to the next lane and repeat the process until all lanes have
 5251     // been written.
 5252     for (i = 0; i < 4; i++) {
 5253       for (j = 0; j < 16; j += 4) {
 5254         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 5255             __ post(keystream, 16));
 5256       }
 5257     }
 5258 
 5259     __ mov(r0, 256);             // Return length of output keystream
 5260     __ leave();
 5261     __ ret(lr);
 5262 
 5263     // bind label and generate local constant data used by this stub
 5264     // The constant data is broken into two 128-bit segments to be loaded
 5265     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 5266     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 5267     // The second 128-bits is a table constant used for 8-bit left rotations.
 5268     __ BIND(L_cc20_const);
 5269     __ emit_int64(0x0000000100000000UL);
 5270     __ emit_int64(0x0000000300000002UL);
 5271     __ emit_int64(0x0605040702010003UL);
 5272     __ emit_int64(0x0E0D0C0F0A09080BUL);
 5273 
 5274     // record the stub entry and end
 5275     store_archive_data(stub_id, start, __ pc());
 5276 
 5277     return start;
 5278   }
 5279 
 5280   // Helpers to schedule parallel operation bundles across vector
 5281   // register sequences of size 2, 4 or 8.
 5282 
 5283   // Implement various primitive computations across vector sequences
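  //
  // As an illustrative sketch (not generated here): with VSeq<4> va(0)
  // and VSeq<4> vb(4), a call such as vs_addv(va, __ T8H, va, vb)
  // simply unrolls to the four instructions
  //
  //   addv(v0, __ T8H, v0, v4)
  //   addv(v1, __ T8H, v1, v5)
  //   addv(v2, __ T8H, v2, v6)
  //   addv(v3, __ T8H, v3, v7)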
 5284 
 5285   template<int N>
 5286   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5287                const VSeq<N>& v1, const VSeq<N>& v2) {
 5288     // output must not be constant
 5289     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5290     // output cannot overwrite pending inputs
 5291     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5292     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5293     for (int i = 0; i < N; i++) {
 5294       __ addv(v[i], T, v1[i], v2[i]);
 5295     }
 5296   }
 5297 
 5298   template<int N>
 5299   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5300                const VSeq<N>& v1, const VSeq<N>& v2) {
 5301     // output must not be constant
 5302     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5303     // output cannot overwrite pending inputs
 5304     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5305     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5306     for (int i = 0; i < N; i++) {
 5307       __ subv(v[i], T, v1[i], v2[i]);
 5308     }
 5309   }
 5310 
 5311   template<int N>
 5312   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5313                const VSeq<N>& v1, const VSeq<N>& v2) {
 5314     // output must not be constant
 5315     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5316     // output cannot overwrite pending inputs
 5317     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5318     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5319     for (int i = 0; i < N; i++) {
 5320       __ mulv(v[i], T, v1[i], v2[i]);
 5321     }
 5322   }
 5323 
 5324   template<int N>
 5325   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 5326     // output must not be constant
 5327     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5328     // output cannot overwrite pending inputs
 5329     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5330     for (int i = 0; i < N; i++) {
 5331       __ negr(v[i], T, v1[i]);
 5332     }
 5333   }
 5334 
 5335   template<int N>
 5336   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5337                const VSeq<N>& v1, int shift) {
 5338     // output must not be constant
 5339     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5340     // output cannot overwrite pending inputs
 5341     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5342     for (int i = 0; i < N; i++) {
 5343       __ sshr(v[i], T, v1[i], shift);
 5344     }
 5345   }
 5346 
 5347   template<int N>
 5348   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5349     // output must not be constant
 5350     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5351     // output cannot overwrite pending inputs
 5352     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5353     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5354     for (int i = 0; i < N; i++) {
 5355       __ andr(v[i], __ T16B, v1[i], v2[i]);
 5356     }
 5357   }
 5358 
 5359   template<int N>
 5360   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5361     // output must not be constant
 5362     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5363     // output cannot overwrite pending inputs
 5364     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5365     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5366     for (int i = 0; i < N; i++) {
 5367       __ orr(v[i], __ T16B, v1[i], v2[i]);
 5368     }
 5369   }
 5370 
 5371   template<int N>
 5372   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 5373     // output must not be constant
 5374     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5375     // output cannot overwrite pending inputs
 5376     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5377     for (int i = 0; i < N; i++) {
 5378       __ notr(v[i], __ T16B, v1[i]);
 5379     }
 5380   }
 5381 
 5382   template<int N>
 5383   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 5384     // output must not be constant
 5385     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5386     // output cannot overwrite pending inputs
 5387     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5388     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5389     for (int i = 0; i < N; i++) {
 5390       __ sqdmulh(v[i], T, v1[i], v2[i]);
 5391     }
 5392   }
 5393 
 5394   template<int N>
 5395   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 5396     // output must not be constant
 5397     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5398     // output cannot overwrite pending inputs
 5399     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5400     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5401     for (int i = 0; i < N; i++) {
 5402       __ mlsv(v[i], T, v1[i], v2[i]);
 5403     }
 5404   }
 5405 
 5406   // load N/2 successive pairs of quadword values from memory in order
 5407   // into N successive vector registers of the sequence via the
 5408   // address supplied in base.
 5409   template<int N>
  void vs_ldpq(const VSeq<N>& v, Register base) {
    static_assert((N & 1) == 0, "sequence length must be even");
    for (int i = 0; i < N/2; i++) {
      __ ldpq(v[2*i], v[2*i+1], Address(base, 32 * i));
    }
  }
 5415 
 5416   // load N/2 successive pairs of quadword values from memory in order
 5417   // into N vector registers of the sequence via the address supplied
 5418   // in base using post-increment addressing
 5419   template<int N>
 5420   void vs_ldpq_post(const VSeq<N>& v, Register base) {
    static_assert((N & 1) == 0, "sequence length must be even");
 5422     for (int i = 0; i < N; i += 2) {
 5423       __ ldpq(v[i], v[i+1], __ post(base, 32));
 5424     }
 5425   }
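  // e.g. as a usage sketch, vs_ldpq_post(VSeq<8>(16), base) emits four
  // ldpq instructions loading v16..v23 from successive 32-byte blocks,
  // leaving base advanced past the 128 bytes consumed.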
 5426 
 5427   // store N successive vector registers of the sequence into N/2
 5428   // successive pairs of quadword memory locations via the address
 5429   // supplied in base using post-increment addressing
 5430   template<int N>
 5431   void vs_stpq_post(const VSeq<N>& v, Register base) {
    static_assert((N & 1) == 0, "sequence length must be even");
 5433     for (int i = 0; i < N; i += 2) {
 5434       __ stpq(v[i], v[i+1], __ post(base, 32));
 5435     }
 5436   }
 5437 
 5438   // load N/2 pairs of quadword values from memory de-interleaved into
 5439   // N vector registers 2 at a time via the address supplied in base
 5440   // using post-increment addressing.
 5441   template<int N>
 5442   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert((N & 1) == 0, "sequence length must be even");
 5444     for (int i = 0; i < N; i += 2) {
 5445       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 5446     }
 5447   }
 5448 
 5449   // store N vector registers interleaved into N/2 pairs of quadword
 5450   // memory locations via the address supplied in base using
 5451   // post-increment addressing.
 5452   template<int N>
 5453   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert((N & 1) == 0, "sequence length must be even");
 5455     for (int i = 0; i < N; i += 2) {
 5456       __ st2(v[i], v[i+1], T, __ post(base, 32));
 5457     }
 5458   }
 5459 
 5460   // load N quadword values from memory de-interleaved into N vector
 5461   // registers 3 elements at a time via the address supplied in base.
 5462   template<int N>
 5463   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert(N % 3 == 0, "sequence length must be a multiple of 3");
 5465     for (int i = 0; i < N; i += 3) {
 5466       __ ld3(v[i], v[i+1], v[i+2], T, base);
 5467     }
 5468   }
 5469 
 5470   // load N quadword values from memory de-interleaved into N vector
 5471   // registers 3 elements at a time via the address supplied in base
 5472   // using post-increment addressing.
 5473   template<int N>
 5474   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert(N % 3 == 0, "sequence length must be a multiple of 3");
 5476     for (int i = 0; i < N; i += 3) {
 5477       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 5478     }
 5479   }
 5480 
 5481   // load N/2 pairs of quadword values from memory into N vector
 5482   // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 5484   // offsets array
 5485   template<int N>
 5486   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5487     for (int i = 0; i < N/2; i++) {
 5488       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5489     }
 5490   }
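  // e.g. with int offsets[4] = { 0, 64, 128, 192 } the call
  // vs_ldpq_indexed(vs1, coeffs, 32, offsets) (as used below) loads
  // the four register pairs of vs1 from coeffs+32, coeffs+96,
  // coeffs+160 and coeffs+224 without modifying coeffs.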
 5491 
 5492   // store N vector registers into N/2 pairs of quadword memory
 5493   // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 5495   // offsets array
 5496   template<int N>
  void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5498     for (int i = 0; i < N/2; i++) {
 5499       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5500     }
 5501   }
 5502 
 5503   // load N single quadword values from memory into N vector registers
 5504   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 5506   // array
 5507   template<int N>
 5508   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5509                       int start, int (&offsets)[N]) {
 5510     for (int i = 0; i < N; i++) {
 5511       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5512     }
 5513   }
 5514 
 5515   // store N vector registers into N single quadword memory locations
 5516   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 5518   // array
 5519   template<int N>
 5520   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5521                       int start, int (&offsets)[N]) {
 5522     for (int i = 0; i < N; i++) {
 5523       __ str(v[i], T, Address(base, start + offsets[i]));
 5524     }
 5525   }
 5526 
 5527   // load N/2 pairs of quadword values from memory de-interleaved into
 5528   // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
 5530   // corresponding entry in the offsets array
 5531   template<int N>
 5532   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5533                       Register tmp, int start, int (&offsets)[N/2]) {
 5534     for (int i = 0; i < N/2; i++) {
 5535       __ add(tmp, base, start + offsets[i]);
 5536       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5537     }
 5538   }
 5539 
 5540   // store N vector registers 2 at a time interleaved into N/2 pairs
 5541   // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
 5543   // corresponding entry in the offsets array
 5544   template<int N>
 5545   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5546                       Register tmp, int start, int (&offsets)[N/2]) {
 5547     for (int i = 0; i < N/2; i++) {
 5548       __ add(tmp, base, start + offsets[i]);
 5549       __ st2(v[2*i], v[2*i+1], T, tmp);
 5550     }
 5551   }
 5552 
 5553   // Helper routines for various flavours of Montgomery multiply
 5554 
 5555   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5556   // multiplications in parallel
 5557   //
 5558 
 5559   // See the montMul() method of the sun.security.provider.ML_DSA
 5560   // class.
 5561   //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q_INV_MOD_R, MONT_Q>
 5566   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5567   // Outputs: va - 4x4S or 4x8H vector register sequences
 5568   // vb, vc, vtmp and vq must all be disjoint
 5569   // va must be disjoint from all other inputs/temps or must equal vc
 5570   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5571   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
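  // n.b. the reduction relies on the identity
  //   (hi(2 * b * c) - hi(2 * m * q)) / 2 == (b * c - m * q) >> MONT_R_BITS
  // (up to rounding), where m = lo(b * c) * MONT_Q_INV_MOD_R: the
  // doubling performed by sqdmulh is cancelled by the halving
  // subtract shsubv, so no separate shift instruction is needed.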
 5572   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5573                    Assembler::SIMD_Arrangement T,
 5574                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5575     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5576     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5577     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5578     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5579 
 5580     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5581     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5582 
 5583     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5584 
 5585     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5586     assert(vs_disjoint(va, vb), "va and vb overlap");
 5587     assert(vs_disjoint(va, vq), "va and vq overlap");
 5588     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5589     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5590 
 5591     // schedule 4 streams of instructions across the vector sequences
 5592     for (int i = 0; i < 4; i++) {
 5593       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5594       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5595     }
 5596 
 5597     for (int i = 0; i < 4; i++) {
 5598       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5599     }
 5600 
 5601     for (int i = 0; i < 4; i++) {
 5602       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5603     }
 5604 
 5605     for (int i = 0; i < 4; i++) {
 5606       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5607     }
 5608   }
 5609 
  // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q_INV_MOD_R, MONT_Q>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
 5623   // vb, vc, vtmp and vq must all be disjoint
 5624   // va must be disjoint from all other inputs/temps or must equal vc
 5625   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5626   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5627   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5628                    Assembler::SIMD_Arrangement T,
 5629                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5630     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5631     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5632     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5633     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5634 
 5635     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5636     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5637 
 5638     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5639 
 5640     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5641     assert(vs_disjoint(va, vb), "va and vb overlap");
 5642     assert(vs_disjoint(va, vq), "va and vq overlap");
 5643     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5644     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5645 
 5646     // schedule 2 streams of instructions across the vector sequences
 5647     for (int i = 0; i < 2; i++) {
 5648       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5649       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5650     }
 5651 
 5652     for (int i = 0; i < 2; i++) {
 5653       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5654     }
 5655 
 5656     for (int i = 0; i < 2; i++) {
 5657       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5658     }
 5659 
 5660     for (int i = 0; i < 2; i++) {
 5661       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5662     }
 5663   }
 5664 
 5665   // Perform 16 16-bit Montgomery multiplications in parallel.
 5666   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5667                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5668     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5669     // It will assert that the register use is valid
 5670     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5671   }
 5672 
 5673   // Perform 32 16-bit Montgomery multiplications in parallel.
 5674   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5675                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5676     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5677     // It will assert that the register use is valid
 5678     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5679   }
 5680 
 5681   // Perform 64 16-bit Montgomery multiplications in parallel.
 5682   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5683                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5684     // Schedule two successive 4x8H multiplies via the montmul helper
 5685     // on the front and back halves of va, vb and vc. The helper will
 5686     // assert that the register use has no overlap conflicts on each
 5687     // individual call but we also need to ensure that the necessary
 5688     // disjoint/equality constraints are met across both calls.
 5689 
 5690     // vb, vc, vtmp and vq must be disjoint. va must either be
 5691     // disjoint from all other registers or equal vc
 5692 
 5693     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5694     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5695     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5696 
 5697     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5698     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5699 
 5700     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5701 
 5702     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5703     assert(vs_disjoint(va, vb), "va and vb overlap");
 5704     assert(vs_disjoint(va, vq), "va and vq overlap");
 5705     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5706 
 5707     // we multiply the front and back halves of each sequence 4 at a
 5708     // time because
 5709     //
 5710     // 1) we are currently only able to get 4-way instruction
 5711     // parallelism at best
 5712     //
 5713     // 2) we need registers for the constants in vq and temporary
 5714     // scratch registers to hold intermediate results so vtmp can only
 5715     // be a VSeq<4> which means we only have 4 scratch slots
 5716 
 5717     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5718     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5719   }
 5720 
 5721   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5722                                const VSeq<4>& vc,
 5723                                const VSeq<4>& vtmp,
 5724                                const VSeq<2>& vq) {
 5725     // compute a = montmul(a1, c)
 5726     kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 5728     vs_subv(va1, __ T8H, va0, vc);
 5729     //    and a0 = a0 + a
 5730     vs_addv(va0, __ T8H, va0, vc);
 5731   }
 5732 
 5733   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5734                                const VSeq<4>& vb,
 5735                                const VSeq<4>& vtmp1,
 5736                                const VSeq<4>& vtmp2,
 5737                                const VSeq<2>& vq) {
 5738     // compute c = a0 - a1
 5739     vs_subv(vtmp1, __ T8H, va0, va1);
 5740     // output a0 = a0 + a1
 5741     vs_addv(va0, __ T8H, va0, va1);
 5742     // output a1 = b montmul c
 5743     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5744   }
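  // n.b. the two routines above are, in effect, the two classic NTT
  // butterflies: kyber_montmul32_sub_add is a Cooley-Tukey style
  // butterfly (multiply by the zeta, then add/subtract) as used by the
  // forward transform, while kyber_sub_add_montmul32 is a
  // Gentleman-Sande style butterfly (add/subtract, then multiply) as
  // used by the inverse transform.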
 5745 
 5746   void load64shorts(const VSeq<8>& v, Register shorts) {
 5747     vs_ldpq_post(v, shorts);
 5748   }
 5749 
 5750   void load32shorts(const VSeq<4>& v, Register shorts) {
 5751     vs_ldpq_post(v, shorts);
 5752   }
 5753 
 5754   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5755     vs_stpq_post(v, tmpAddr);
 5756   }
 5757 
 5758   // Kyber NTT function.
 5759   // Implements
 5760   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5761   //
 5762   // coeffs (short[256]) = c_rarg0
 5763   // ntt_zetas (short[256]) = c_rarg1
 5764   address generate_kyberNtt() {
 5765     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5766     int entry_count = StubInfo::entry_count(stub_id);
 5767     assert(entry_count == 1, "sanity check");
 5768     address start = load_archive_data(stub_id);
 5769     if (start != nullptr) {
 5770       return start;
 5771     }
 5772     __ align(CodeEntryAlignment);
 5773     StubCodeMark mark(this, stub_id);
 5774     start = __ pc();
 5775     __ enter();
 5776 
 5777     const Register coeffs = c_rarg0;
 5778     const Register zetas = c_rarg1;
 5779 
 5780     const Register kyberConsts = r10;
 5781     const Register tmpAddr = r11;
 5782 
 5783     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5784     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5785     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5786 
 5787     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5788     // load the montmul constants
 5789     vs_ldpq(vq, kyberConsts);
 5790 
 5791     // Each level corresponds to an iteration of the outermost loop of the
 5792     // Java method seilerNTT(int[] coeffs). There are some differences
 5793     // from what is done in the seilerNTT() method, though:
    // 1. The computation uses 16-bit signed values; we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a bigger array: 128 zetas are stored
    // for each level, which makes it easier to fill up the vector
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (that way there should not be any overflow during
    // the inverse NTT computation); here we use R = 2^16 so that we can
    // use 16-bit arithmetic in the vector unit.
 5803     //
 5804     // On each level, we fill up the vector registers in such a way that the
 5805     // array elements that need to be multiplied by the zetas go into one
 5806     // set of vector registers while the corresponding ones that don't need to
 5807     // be multiplied, go into another set.
 5808     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5809     // registers interleaving the steps of 4 identical computations,
 5810     // each done on 8 16-bit values per register.
 5811 
    // At levels 0-3 the coefficients that get multiplied by the zetas,
    // and the corresponding ones that get added/subtracted, occur in
    // discrete blocks whose size is some multiple of 32.
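    // n.b. all the offsets used below are byte offsets into the
    // short[256] coefficient array, so e.g. coeffs + 256 addresses
    // coefficient 128 and a block of 32 bytes covers 16 coefficients.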
 5815 
 5816     // level 0
 5817     __ add(tmpAddr, coeffs, 256);
 5818     load64shorts(vs1, tmpAddr);
 5819     load64shorts(vs2, zetas);
 5820     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5821     __ add(tmpAddr, coeffs, 0);
 5822     load64shorts(vs1, tmpAddr);
 5823     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5824     vs_addv(vs1, __ T8H, vs1, vs2);
 5825     __ add(tmpAddr, coeffs, 0);
 5826     vs_stpq_post(vs1, tmpAddr);
 5827     __ add(tmpAddr, coeffs, 256);
 5828     vs_stpq_post(vs3, tmpAddr);
 5829     // restore montmul constants
 5830     vs_ldpq(vq, kyberConsts);
 5831     load64shorts(vs1, tmpAddr);
 5832     load64shorts(vs2, zetas);
 5833     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5834     __ add(tmpAddr, coeffs, 128);
 5835     load64shorts(vs1, tmpAddr);
 5836     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5837     vs_addv(vs1, __ T8H, vs1, vs2);
 5838     __ add(tmpAddr, coeffs, 128);
 5839     store64shorts(vs1, tmpAddr);
 5840     __ add(tmpAddr, coeffs, 384);
 5841     store64shorts(vs3, tmpAddr);
 5842 
 5843     // level 1
 5844     // restore montmul constants
 5845     vs_ldpq(vq, kyberConsts);
 5846     __ add(tmpAddr, coeffs, 128);
 5847     load64shorts(vs1, tmpAddr);
 5848     load64shorts(vs2, zetas);
 5849     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5850     __ add(tmpAddr, coeffs, 0);
 5851     load64shorts(vs1, tmpAddr);
 5852     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5853     vs_addv(vs1, __ T8H, vs1, vs2);
 5854     __ add(tmpAddr, coeffs, 0);
 5855     store64shorts(vs1, tmpAddr);
 5856     store64shorts(vs3, tmpAddr);
 5857     vs_ldpq(vq, kyberConsts);
 5858     __ add(tmpAddr, coeffs, 384);
 5859     load64shorts(vs1, tmpAddr);
 5860     load64shorts(vs2, zetas);
 5861     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5862     __ add(tmpAddr, coeffs, 256);
 5863     load64shorts(vs1, tmpAddr);
 5864     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5865     vs_addv(vs1, __ T8H, vs1, vs2);
 5866     __ add(tmpAddr, coeffs, 256);
 5867     store64shorts(vs1, tmpAddr);
 5868     store64shorts(vs3, tmpAddr);
 5869 
 5870     // level 2
 5871     vs_ldpq(vq, kyberConsts);
 5872     int offsets1[4] = { 0, 32, 128, 160 };
 5873     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5874     load64shorts(vs2, zetas);
 5875     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5876     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5878     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5879     vs_addv(vs1, __ T8H, vs1, vs2);
 5880     __ add(tmpAddr, coeffs, 0);
 5881     vs_stpq_post(vs_front(vs1), tmpAddr);
 5882     vs_stpq_post(vs_front(vs3), tmpAddr);
 5883     vs_stpq_post(vs_back(vs1), tmpAddr);
 5884     vs_stpq_post(vs_back(vs3), tmpAddr);
 5885     vs_ldpq(vq, kyberConsts);
 5886     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5887     load64shorts(vs2, zetas);
 5888     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5889     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5891     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5892     vs_addv(vs1, __ T8H, vs1, vs2);
 5893     __ add(tmpAddr, coeffs, 256);
 5894     vs_stpq_post(vs_front(vs1), tmpAddr);
 5895     vs_stpq_post(vs_front(vs3), tmpAddr);
 5896     vs_stpq_post(vs_back(vs1), tmpAddr);
 5897     vs_stpq_post(vs_back(vs3), tmpAddr);
 5898 
 5899     // level 3
 5900     vs_ldpq(vq, kyberConsts);
 5901     int offsets2[4] = { 0, 64, 128, 192 };
 5902     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5903     load64shorts(vs2, zetas);
 5904     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5905     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5906     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5907     vs_addv(vs1, __ T8H, vs1, vs2);
 5908     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5909     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5910 
 5911     vs_ldpq(vq, kyberConsts);
 5912     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5913     load64shorts(vs2, zetas);
 5914     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5915     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5916     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5917     vs_addv(vs1, __ T8H, vs1, vs2);
 5918     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5919     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5920 
 5921     // level 4
    // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5924 
 5925     vs_ldpq(vq, kyberConsts);
 5926     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5927     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5928     load64shorts(vs2, zetas);
 5929     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5930     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5931     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5932     vs_addv(vs1, __ T8H, vs1, vs2);
 5933     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5934     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5935 
 5936     vs_ldpq(vq, kyberConsts);
 5937     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5938     load64shorts(vs2, zetas);
 5939     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5940     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5941     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5942     vs_addv(vs1, __ T8H, vs1, vs2);
 5943     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5944     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5945 
 5946     // level 5
    // At level 5 related coefficients occur in discrete blocks of size 8 so
    // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5949 
 5950     vs_ldpq(vq, kyberConsts);
 5951     int offsets4[4] = { 0, 32, 64, 96 };
 5952     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5953     load32shorts(vs_front(vs2), zetas);
 5954     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5955     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5956     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5957     load32shorts(vs_front(vs2), zetas);
 5958     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5959     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5960     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5961     load32shorts(vs_front(vs2), zetas);
 5962     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5963     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5964 
 5965     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5966     load32shorts(vs_front(vs2), zetas);
 5967     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5968     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5969 
 5970     // level 6
    // At level 6 related coefficients occur in discrete blocks of size 4 so
    // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5973 
 5974     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5975     load32shorts(vs_front(vs2), zetas);
 5976     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5977     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5978     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5980     load32shorts(vs_front(vs2), zetas);
 5981     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5982     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5983 
 5984     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5985     load32shorts(vs_front(vs2), zetas);
 5986     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5987     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5988 
 5989     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5990     load32shorts(vs_front(vs2), zetas);
 5991     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5992     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5993 
 5994     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5995     __ mov(r0, zr); // return 0
 5996     __ ret(lr);
 5997 
 5998     // record the stub entry and end
 5999     store_archive_data(stub_id, start, __ pc());
 6000 
 6001     return start;
 6002   }
 6003 
 6004   // Kyber Inverse NTT function
 6005   // Implements
 6006   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 6007   //
 6008   // coeffs (short[256]) = c_rarg0
 6009   // ntt_zetas (short[256]) = c_rarg1
 6010   address generate_kyberInverseNtt() {
 6011     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 6012     int entry_count = StubInfo::entry_count(stub_id);
 6013     assert(entry_count == 1, "sanity check");
 6014     address start = load_archive_data(stub_id);
 6015     if (start != nullptr) {
 6016       return start;
 6017     }
 6018     __ align(CodeEntryAlignment);
 6019     StubCodeMark mark(this, stub_id);
 6020     start = __ pc();
 6021     __ enter();
 6022 
 6023     const Register coeffs = c_rarg0;
 6024     const Register zetas = c_rarg1;
 6025 
 6026     const Register kyberConsts = r10;
 6027     const Register tmpAddr = r11;
 6028     const Register tmpAddr2 = c_rarg2;
 6029 
 6030     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 6031     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6032     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6033 
 6034     __ lea(kyberConsts,
 6035              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6036 
 6037     // level 0
    // At level 0 related coefficients occur in discrete blocks of size 4 so
    // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 6040 
 6041     vs_ldpq(vq, kyberConsts);
 6042     int offsets4[4] = { 0, 32, 64, 96 };
 6043     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6044     load32shorts(vs_front(vs2), zetas);
 6045     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6046                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6047     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6048     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6049     load32shorts(vs_front(vs2), zetas);
 6050     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6051                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6052     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6053     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6054     load32shorts(vs_front(vs2), zetas);
 6055     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6056                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6057     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6058     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6059     load32shorts(vs_front(vs2), zetas);
 6060     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6061                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6062     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6063 
 6064     // level 1
    // At level 1 related coefficients occur in discrete blocks of size 8 so
    // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 6067 
 6068     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6069     load32shorts(vs_front(vs2), zetas);
 6070     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6071                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6072     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6073     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6074     load32shorts(vs_front(vs2), zetas);
 6075     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6076                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6077     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6078 
 6079     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6080     load32shorts(vs_front(vs2), zetas);
 6081     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6082                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6083     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6084     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6085     load32shorts(vs_front(vs2), zetas);
 6086     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6087                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6088     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6089 
 6090     // level 2
    // At level 2 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 6093 
 6094     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6095     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6096     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6097     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6098     vs_subv(vs1, __ T8H, vs1, vs2);
 6099     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 6100     load64shorts(vs2, zetas);
 6101     vs_ldpq(vq, kyberConsts);
 6102     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6103     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6104 
 6105     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6106     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6107     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6108     vs_subv(vs1, __ T8H, vs1, vs2);
 6109     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 6110     load64shorts(vs2, zetas);
 6111     vs_ldpq(vq, kyberConsts);
 6112     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6113     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6114 
 6115     // Barrett reduction at indexes where overflow may happen
 6116 
 6117     // load q and the multiplier for the Barrett reduction
 6118     __ add(tmpAddr, kyberConsts, 16);
 6119     vs_ldpq(vq, tmpAddr);
 6120 
    VSeq<8> vq1 = VSeq<8>(vq[0], 0); // two constant length-8 sequences
    VSeq<8> vq2 = VSeq<8>(vq[1], 0); // replicating the two constants above
    VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd constant sequence, used for the final montmul by 2^-n
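    // The reduction computes t = (a * multiplier) >> 26 followed by
    // a - t * q for each coefficient a: sqdmulh yields
    // hi16(2 * a * multiplier), i.e. (a * multiplier) >> 15, the sshr
    // supplies the remaining 11 bits of shift, and mlsv subtracts
    // t * q. (The 26-bit total assumes the usual Kyber Barrett
    // multiplier of roughly 2^26 / q loaded above.)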
 6124     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6125     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6126     vs_sshr(vs2, __ T8H, vs2, 11);
 6127     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6128     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6129     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6130     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6131     vs_sshr(vs2, __ T8H, vs2, 11);
 6132     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6133     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6134 
 6135     // level 3
    // From level 3 upwards coefficients occur in discrete blocks whose size is
    // some multiple of 32 so they can be loaded using ldpq and suitable indexes.
 6138 
 6139     int offsets2[4] = { 0, 64, 128, 192 };
 6140     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 6141     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 6142     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6143     vs_subv(vs1, __ T8H, vs1, vs2);
 6144     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 6145     load64shorts(vs2, zetas);
 6146     vs_ldpq(vq, kyberConsts);
 6147     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6148     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 6149 
 6150     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 6151     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6152     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6153     vs_subv(vs1, __ T8H, vs1, vs2);
 6154     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 6155     load64shorts(vs2, zetas);
 6156     vs_ldpq(vq, kyberConsts);
 6157     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6158     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6159 
 6160     // level 4
 6161 
 6162     int offsets1[4] = { 0, 32, 128, 160 };
 6163     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 6164     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 6165     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6166     vs_subv(vs1, __ T8H, vs1, vs2);
 6167     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 6168     load64shorts(vs2, zetas);
 6169     vs_ldpq(vq, kyberConsts);
 6170     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6171     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 6172 
 6173     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 6174     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6175     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6176     vs_subv(vs1, __ T8H, vs1, vs2);
 6177     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 6178     load64shorts(vs2, zetas);
 6179     vs_ldpq(vq, kyberConsts);
 6180     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6181     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6182 
 6183     // level 5
 6184 
 6185     __ add(tmpAddr, coeffs, 0);
 6186     load64shorts(vs1, tmpAddr);
 6187     __ add(tmpAddr, coeffs, 128);
 6188     load64shorts(vs2, tmpAddr);
 6189     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6190     vs_subv(vs1, __ T8H, vs1, vs2);
 6191     __ add(tmpAddr, coeffs, 0);
 6192     store64shorts(vs3, tmpAddr);
 6193     load64shorts(vs2, zetas);
 6194     vs_ldpq(vq, kyberConsts);
 6195     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6196     __ add(tmpAddr, coeffs, 128);
 6197     store64shorts(vs2, tmpAddr);
 6198 
 6199     load64shorts(vs1, tmpAddr);
 6200     __ add(tmpAddr, coeffs, 384);
 6201     load64shorts(vs2, tmpAddr);
 6202     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6203     vs_subv(vs1, __ T8H, vs1, vs2);
 6204     __ add(tmpAddr, coeffs, 256);
 6205     store64shorts(vs3, tmpAddr);
 6206     load64shorts(vs2, zetas);
 6207     vs_ldpq(vq, kyberConsts);
 6208     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6209     __ add(tmpAddr, coeffs, 384);
 6210     store64shorts(vs2, tmpAddr);
 6211 
 6212     // Barrett reduction at indexes where overflow may happen
 6213 
 6214     // load q and the multiplier for the Barrett reduction
 6215     __ add(tmpAddr, kyberConsts, 16);
 6216     vs_ldpq(vq, tmpAddr);
 6217 
 6218     int offsets0[2] = { 0, 256 };
 6219     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6220     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6221     vs_sshr(vs2, __ T8H, vs2, 11);
 6222     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6223     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6224 
 6225     // level 6
 6226 
 6227     __ add(tmpAddr, coeffs, 0);
 6228     load64shorts(vs1, tmpAddr);
 6229     __ add(tmpAddr, coeffs, 256);
 6230     load64shorts(vs2, tmpAddr);
 6231     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6232     vs_subv(vs1, __ T8H, vs1, vs2);
 6233     __ add(tmpAddr, coeffs, 0);
 6234     store64shorts(vs3, tmpAddr);
 6235     load64shorts(vs2, zetas);
 6236     vs_ldpq(vq, kyberConsts);
 6237     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6238     __ add(tmpAddr, coeffs, 256);
 6239     store64shorts(vs2, tmpAddr);
 6240 
 6241     __ add(tmpAddr, coeffs, 128);
 6242     load64shorts(vs1, tmpAddr);
 6243     __ add(tmpAddr, coeffs, 384);
 6244     load64shorts(vs2, tmpAddr);
 6245     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6246     vs_subv(vs1, __ T8H, vs1, vs2);
 6247     __ add(tmpAddr, coeffs, 128);
 6248     store64shorts(vs3, tmpAddr);
 6249     load64shorts(vs2, zetas);
 6250     vs_ldpq(vq, kyberConsts);
 6251     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6252     __ add(tmpAddr, coeffs, 384);
 6253     store64shorts(vs2, tmpAddr);
 6254 
 6255     // multiply by 2^-n
 6256 
 6257     // load toMont(2^-n mod q)
 6258     __ add(tmpAddr, kyberConsts, 48);
 6259     __ ldr(v29, __ Q, tmpAddr);
 6260 
 6261     vs_ldpq(vq, kyberConsts);
 6262     __ add(tmpAddr, coeffs, 0);
 6263     load64shorts(vs1, tmpAddr);
 6264     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6265     __ add(tmpAddr, coeffs, 0);
 6266     store64shorts(vs2, tmpAddr);
 6267 
    // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
 6269     load64shorts(vs1, tmpAddr);
 6270     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6271     __ add(tmpAddr, coeffs, 128);
 6272     store64shorts(vs2, tmpAddr);
 6273 
 6274     // now tmpAddr contains coeffs + 256
 6275     load64shorts(vs1, tmpAddr);
 6276     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6277     __ add(tmpAddr, coeffs, 256);
 6278     store64shorts(vs2, tmpAddr);
 6279 
 6280     // now tmpAddr contains coeffs + 384
 6281     load64shorts(vs1, tmpAddr);
 6282     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6283     __ add(tmpAddr, coeffs, 384);
 6284     store64shorts(vs2, tmpAddr);
 6285 
 6286     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6287     __ mov(r0, zr); // return 0
 6288     __ ret(lr);
 6289 
 6290     // record the stub entry and end
 6291     store_archive_data(stub_id, start, __ pc());
 6292 
 6293     return start;
 6294   }
 6295 
 6296   // Kyber multiply polynomials in the NTT domain.
 6297   // Implements
 6298   // static int implKyberNttMult(
 6299   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 6300   //
 6301   // result (short[256]) = c_rarg0
 6302   // ntta (short[256]) = c_rarg1
 6303   // nttb (short[256]) = c_rarg2
 6304   // zetas (short[128]) = c_rarg3
 6305   address generate_kyberNttMult() {
 6306     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 6307     int entry_count = StubInfo::entry_count(stub_id);
 6308     assert(entry_count == 1, "sanity check");
 6309     address start = load_archive_data(stub_id);
 6310     if (start != nullptr) {
 6311       return start;
 6312     }
 6313     __ align(CodeEntryAlignment);
 6314     StubCodeMark mark(this, stub_id);
 6315     start = __ pc();
 6316     __ enter();
 6317 
 6318     const Register result = c_rarg0;
 6319     const Register ntta = c_rarg1;
 6320     const Register nttb = c_rarg2;
 6321     const Register zetas = c_rarg3;
 6322 
 6323     const Register kyberConsts = r10;
 6324     const Register limit = r11;
 6325 
 6326     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 6327     VSeq<4> vs3(16), vs4(20);
 6328     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 6329     VSeq<2> vz(28);          // pair of zetas
 6330     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 6331 
 6332     __ lea(kyberConsts,
 6333              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6334 
 6335     Label kyberNttMult_loop;
 6336 
 6337     __ add(limit, result, 512);
 6338 
 6339     // load q and qinv
 6340     vs_ldpq(vq, kyberConsts);
 6341 
 6342     // load R^2 mod q (to convert back from Montgomery representation)
 6343     __ add(kyberConsts, kyberConsts, 64);
 6344     __ ldr(v27, __ Q, kyberConsts);
 6345 
 6346     __ BIND(kyberNttMult_loop);
 6347 
 6348     // load 16 zetas
 6349     vs_ldpq_post(vz, zetas);
 6350 
 6351     // load 2 sets of 32 coefficients from the two input arrays
    // interleaved as shorts, i.e. pairs of shorts adjacent in memory
 6353     // are striped across pairs of vector registers
 6354     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 6355     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 6356     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 6357     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 6358 
 6359     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 6360     // i.e. montmul the first and second halves of vs1 in order and
 6361     // then with one sequence reversed storing the two results in vs3
 6362     //
 6363     // vs3[0] <- montmul(a0, b0)
 6364     // vs3[1] <- montmul(a1, b1)
 6365     // vs3[2] <- montmul(a0, b1)
 6366     // vs3[3] <- montmul(a1, b0)
 6367     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 6368     kyber_montmul16(vs_back(vs3),
 6369                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 6370 
 6371     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 6372     // i.e. montmul the first and second halves of vs4 in order and
 6373     // then with one sequence reversed storing the two results in vs1
 6374     //
 6375     // vs1[0] <- montmul(a2, b2)
 6376     // vs1[1] <- montmul(a3, b3)
 6377     // vs1[2] <- montmul(a2, b3)
 6378     // vs1[3] <- montmul(a3, b2)
 6379     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 6380     kyber_montmul16(vs_back(vs1),
 6381                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 6382 
 6383     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 6384     // We can schedule two montmuls at a time if we use a suitable vector
 6385     // sequence <vs3[1], vs1[1]>.
 6386     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 6387     VSeq<2> vs5(vs3[1], delta);
 6388 
 6389     // vs3[1] <- montmul(montmul(a1, b1), z0)
 6390     // vs1[1] <- montmul(montmul(a3, b3), z1)
 6391     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 6392 
 6393     // add results in pairs storing in vs3
 6394     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 6395     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 6396     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 6397 
 6398     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 6399     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 6400     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 6401 
 6402     // vs1 <- montmul(vs3, montRSquareModQ)
 6403     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 6404 
    // store back the two pairs of result vectors de-interleaved as 8H
    // elements, i.e. each pair of shorts striped across a register pair
    // is stored adjacent in memory
 6408     vs_st2_post(vs1, __ T8H, result);
 6409 
 6410     __ cmp(result, limit);
 6411     __ br(Assembler::NE, kyberNttMult_loop);
 6412 
 6413     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6414     __ mov(r0, zr); // return 0
 6415     __ ret(lr);
 6416 
 6417     // record the stub entry and end
 6418     store_archive_data(stub_id, start, __ pc());
 6419 
 6420     return start;
 6421   }
 6422 
 6423   // Kyber add 2 polynomials.
 6424   // Implements
 6425   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 6426   //
 6427   // result (short[256]) = c_rarg0
 6428   // a (short[256]) = c_rarg1
 6429   // b (short[256]) = c_rarg2
 6430   address generate_kyberAddPoly_2() {
 6431     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 6432     int entry_count = StubInfo::entry_count(stub_id);
 6433     assert(entry_count == 1, "sanity check");
 6434     address start = load_archive_data(stub_id);
 6435     if (start != nullptr) {
 6436       return start;
 6437     }
 6438     __ align(CodeEntryAlignment);
 6439     StubCodeMark mark(this, stub_id);
 6440     start = __ pc();
 6441     __ enter();
 6442 
 6443     const Register result = c_rarg0;
 6444     const Register a = c_rarg1;
 6445     const Register b = c_rarg2;
 6446 
 6447     const Register kyberConsts = r11;
 6448 
 6449     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 6450     // So, we can load, add and store the data in 3 groups of 11,
 6451     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 6452     // registers. A further constraint is that the mapping needs
 6453     // to skip callee saves. So, we allocate the register
 6454     // sequences using two 8 sequences, two 2 sequences and two
 6455     // single registers.
 6456     VSeq<8> vs1_1(0);
 6457     VSeq<2> vs1_2(16);
 6458     FloatRegister vs1_3 = v28;
 6459     VSeq<8> vs2_1(18);
 6460     VSeq<2> vs2_2(26);
 6461     FloatRegister vs2_3 = v29;
 6462 
 6463     // two constant vector sequences
 6464     VSeq<8> vc_1(31, 0);
 6465     VSeq<2> vc_2(31, 0);
 6466 
 6467     FloatRegister vc_3 = v31;
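    // n.b. a sequence constructed with delta 0 names its base register
    // at every index, so vc_1, vc_2 and vc_3 all denote v31 and supply
    // the same constant (q, loaded below) to every add in a bundle.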
 6468     __ lea(kyberConsts,
 6469              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6470 
 6471     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6472     for (int i = 0; i < 3; i++) {
 6473       // load 80 or 88 values from a into vs1_1/2/3
 6474       vs_ldpq_post(vs1_1, a);
 6475       vs_ldpq_post(vs1_2, a);
 6476       if (i < 2) {
 6477         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6478       }
 6479       // load 80 or 88 values from b into vs2_1/2/3
 6480       vs_ldpq_post(vs2_1, b);
 6481       vs_ldpq_post(vs2_2, b);
 6482       if (i < 2) {
 6483         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6484       }
 6485       // sum 80 or 88 values across vs1 and vs2 into vs1
 6486       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6487       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6488       if (i < 2) {
 6489         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6490       }
 6491       // add constant to all 80 or 88 results
 6492       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6493       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6494       if (i < 2) {
 6495         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6496       }
 6497       // store 80 or 88 values
 6498       vs_stpq_post(vs1_1, result);
 6499       vs_stpq_post(vs1_2, result);
 6500       if (i < 2) {
 6501         __ str(vs1_3, __ Q, __ post(result, 16));
 6502       }
 6503     }
 6504 
 6505     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6506     __ mov(r0, zr); // return 0
 6507     __ ret(lr);
 6508 
 6509     // record the stub entry and end
 6510     store_archive_data(stub_id, start, __ pc());
 6511 
 6512     return start;
 6513   }
 6514 
 6515   // Kyber add 3 polynomials.
 6516   // Implements
 6517   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 6518   //
 6519   // result (short[256]) = c_rarg0
 6520   // a (short[256]) = c_rarg1
 6521   // b (short[256]) = c_rarg2
 6522   // c (short[256]) = c_rarg3
 6523   address generate_kyberAddPoly_3() {
 6524     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 6525     int entry_count = StubInfo::entry_count(stub_id);
 6526     assert(entry_count == 1, "sanity check");
 6527     address start = load_archive_data(stub_id);
 6528     if (start != nullptr) {
 6529       return start;
 6530     }
 6531     __ align(CodeEntryAlignment);
 6532     StubCodeMark mark(this, stub_id);
 6533     start = __ pc();
 6534     __ enter();
 6535 
 6536     const Register result = c_rarg0;
 6537     const Register a = c_rarg1;
 6538     const Register b = c_rarg2;
 6539     const Register c = c_rarg3;
 6540 
 6541     const Register kyberConsts = r11;
 6542 
 6543     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6544     // quadwords.  So, we can load, add and store the data in 3
 6545     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6546     // of 10 or 11 registers. A further constraint is that the
 6547     // mapping needs to skip callee saves. So, we allocate the
 6548     // register sequences using two 8 sequences, two 2 sequences
 6549     // and two single registers.
 6550     VSeq<8> vs1_1(0);
 6551     VSeq<2> vs1_2(16);
 6552     FloatRegister vs1_3 = v28;
 6553     VSeq<8> vs2_1(18);
 6554     VSeq<2> vs2_2(26);
 6555     FloatRegister vs2_3 = v29;
 6556 
 6557     // two constant vector sequences
 6558     VSeq<8> vc_1(31, 0);
 6559     VSeq<2> vc_2(31, 0);
 6560 
 6561     FloatRegister vc_3 = v31;
 6562 
 6563     __ lea(kyberConsts,
 6564              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6565 
 6566     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6567     for (int i = 0; i < 3; i++) {
 6568       // load 80 or 88 values from a into vs1_1/2/3
 6569       vs_ldpq_post(vs1_1, a);
 6570       vs_ldpq_post(vs1_2, a);
 6571       if (i < 2) {
 6572         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6573       }
 6574       // load 80 or 88 values from b into vs2_1/2/3
 6575       vs_ldpq_post(vs2_1, b);
 6576       vs_ldpq_post(vs2_2, b);
 6577       if (i < 2) {
 6578         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6579       }
 6580       // sum 80 or 88 values across vs1 and vs2 into vs1
 6581       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6582       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6583       if (i < 2) {
 6584         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6585       }
 6586       // load 80 or 88 values from c into vs2_1/2/3
 6587       vs_ldpq_post(vs2_1, c);
 6588       vs_ldpq_post(vs2_2, c);
 6589       if (i < 2) {
 6590         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6591       }
 6592       // sum 80 or 88 values across vs1 and vs2 into vs1
 6593       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6594       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6595       if (i < 2) {
 6596         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6597       }
 6598       // add constant to all 80 or 88 results
 6599       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6600       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6601       if (i < 2) {
 6602         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6603       }
 6604       // store 80 or 88 values
 6605       vs_stpq_post(vs1_1, result);
 6606       vs_stpq_post(vs1_2, result);
 6607       if (i < 2) {
 6608         __ str(vs1_3, __ Q, __ post(result, 16));
 6609       }
 6610     }
 6611 
 6612     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6613     __ mov(r0, zr); // return 0
 6614     __ ret(lr);
 6615 
 6616     // record the stub entry and end
 6617     store_archive_data(stub_id, start, __ pc());
 6618 
 6619     return start;
 6620   }
 6621 
 6622   // Kyber parse XOF output to polynomial coefficient candidates
 6623   // or decodePoly(12, ...).
 6624   // Implements
 6625   // static int implKyber12To16(
 6626   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6627   //
 6628   // we assume that parsed and condensed are allocated such that for
 6629   // n = (parsedLength + 63) / 64
 6630   // n blocks of 96 bytes of input can be processed, i.e.
 6631   // index + n * 96 <= condensed.length and
 6632   // n * 64 <= parsed.length
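       //
       // e.g. parsedLength == 256 gives n == 4, so the loop below
       // consumes 4 * 96 == 384 bytes of condensed input and produces
       // 4 * 64 == 256 parsed shorts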
 6633   //
 6634   // condensed (byte[]) = c_rarg0
 6635   // condensedIndex = c_rarg1
 6636   // parsed (short[]) = c_rarg2
 6637   // parsedLength = c_rarg3
 6638   address generate_kyber12To16() {
 6639     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6640     int entry_count = StubInfo::entry_count(stub_id);
 6641     assert(entry_count == 1, "sanity check");
 6642     address start = load_archive_data(stub_id);
 6643     if (start != nullptr) {
 6644       return start;
 6645     }
 6646     Label L_F00, L_loop;
 6647 
 6648     __ align(CodeEntryAlignment);
 6649     StubCodeMark mark(this, stub_id);
 6650     start = __ pc();
 6651     __ enter();
 6652 
 6653     const Register condensed = c_rarg0;
 6654     const Register condensedOffs = c_rarg1;
 6655     const Register parsed = c_rarg2;
 6656     const Register parsedLength = c_rarg3;
 6657 
 6658     const Register tmpAddr = r11;
 6659 
 6660     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6661     // quadwords so we need a 6 vector sequence for the inputs.
 6662     // Parsing produces 64 shorts, employing two 8 vector
 6663     // sequences to store and combine the intermediate data.
 6664     VSeq<6> vin(24);
 6665     VSeq<8> va(0), vb(16);
 6666 
 6667     __ adr(tmpAddr, L_F00);
 6668     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6669     __ add(condensed, condensed, condensedOffs);
 6670 
 6671     __ BIND(L_loop);
 6672     // load 96 (6 x 16B) byte values
 6673     vs_ld3_post(vin, __ T16B, condensed);
 6674 
 6675     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6676     // holds 48 (16x3) contiguous bytes from memory striped
 6677     // horizontally across each of the 16 byte lanes. Equivalently,
 6678     // that is 16 pairs of 12-bit integers. Likewise the back half
 6679     // holds the next 48 bytes in the same arrangement.
 6680 
 6681     // Each vector in the front half can also be viewed as a vertical
 6682     // strip across the 16 pairs of 12 bit integers. Each byte in
 6683     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6684     // byte in vin[1] stores the high 4 bits of the first int and the
 6685     // low 4 bits of the second int. Each byte in vin[2] stores the
 6686     // high 8 bits of the second int. Likewise the vectors in second
 6687     // half.
 6688 
 6689     // Converting the data to 16-bit shorts requires first of all
 6690     // expanding each of the 6 x 16B vectors into 6 corresponding
 6691     // pairs of 8H vectors. Mask, shift and add operations on the
 6692     // resulting vector pairs can be used to combine 4 and 8 bit
 6693     // parts of related 8H vector elements.
 6694     //
 6695     // The middle vectors (vin[2] and vin[5]) are actually expanded
 6696     // twice, one copy manipulated to provide the lower 4 bits
 6697     // belonging to the first short in a pair and another copy
 6698     // manipulated to provide the higher 4 bits belonging to the
6699     // second short in a pair. This is why the vector sequences va
 6700     // and vb used to hold the expanded 8H elements are of length 8.
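
         // As a scalar sketch (illustrative C, not the stub code
         // itself): given a byte triple { b0, b1, b2 } encoding two
         // 12-bit ints x and y, the shifts, masks and adds below
         // compute, 16 lanes at a time,
         //
         //   x = b0 | ((b1 & 0x0f) << 8); // lo 8 bits from b0, hi 4 from b1
         //   y = (b1 >> 4) | (b2 << 4);   // lo 4 bits from b1, hi 8 from b2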
 6701 
 6702     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6703     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6704     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6705     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6706     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6707     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6708     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6709     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6710 
 6711     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6712     // and vb[4:5]
 6713     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6714     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6715     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6716     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6717     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6718     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6719 
 6720     // shift lo byte of copy 1 of the middle stripe into the high byte
 6721     __ shl(va[2], __ T8H, va[2], 8);
 6722     __ shl(va[3], __ T8H, va[3], 8);
 6723     __ shl(vb[2], __ T8H, vb[2], 8);
 6724     __ shl(vb[3], __ T8H, vb[3], 8);
 6725 
 6726     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6727     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6728     // are in bit positions [4..11].
 6729     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6730     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6731     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6732     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6733 
 6734     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6735     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6736     // copy2
 6737     __ andr(va[2], __ T16B, va[2], v31);
 6738     __ andr(va[3], __ T16B, va[3], v31);
 6739     __ ushr(va[4], __ T8H, va[4], 4);
 6740     __ ushr(va[5], __ T8H, va[5], 4);
 6741     __ andr(vb[2], __ T16B, vb[2], v31);
 6742     __ andr(vb[3], __ T16B, vb[3], v31);
 6743     __ ushr(vb[4], __ T8H, vb[4], 4);
 6744     __ ushr(vb[5], __ T8H, vb[5], 4);
 6745 
 6746     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6747     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6748     // n.b. the ordering ensures: i) inputs are consumed before they
 6749     // are overwritten ii) the order of 16-bit results across successive
 6750     // pairs of vectors in va and then vb reflects the order of the
 6751     // corresponding 12-bit inputs
 6752     __ addv(va[0], __ T8H, va[0], va[2]);
 6753     __ addv(va[2], __ T8H, va[1], va[3]);
 6754     __ addv(va[1], __ T8H, va[4], va[6]);
 6755     __ addv(va[3], __ T8H, va[5], va[7]);
 6756     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6757     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6758     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6759     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6760 
 6761     // store 64 results interleaved as shorts
 6762     vs_st2_post(vs_front(va), __ T8H, parsed);
 6763     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6764 
 6765     __ sub(parsedLength, parsedLength, 64);
 6766     __ cmp(parsedLength, (u1)0);
 6767     __ br(Assembler::GT, L_loop);
 6768 
 6769     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6770     __ mov(r0, zr); // return 0
 6771     __ ret(lr);
 6772 
 6773     // bind label and generate constant data used by this stub
 6774     __ BIND(L_F00);
 6775     __ emit_int64(0x0f000f000f000f00);
 6776     __ emit_int64(0x0f000f000f000f00);
 6777 
 6778     // record the stub entry and end
 6779     store_archive_data(stub_id, start, __ pc());
 6780 
 6781     return start;
 6782   }
 6783 
 6784   // Kyber Barrett reduce function.
 6785   // Implements
 6786   // static int implKyberBarrettReduce(short[] coeffs) {}
 6787   //
 6788   // coeffs (short[256]) = c_rarg0
 6789   address generate_kyberBarrettReduce() {
 6790     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6791     int entry_count = StubInfo::entry_count(stub_id);
 6792     assert(entry_count == 1, "sanity check");
 6793     address start = load_archive_data(stub_id);
 6794     if (start != nullptr) {
 6795       return start;
 6796     }
 6797     __ align(CodeEntryAlignment);
 6798     StubCodeMark mark(this, stub_id);
 6799     start = __ pc();
 6800     __ enter();
 6801 
 6802     const Register coeffs = c_rarg0;
 6803 
 6804     const Register kyberConsts = r10;
 6805     const Register result = r11;
 6806 
 6807     // As above we process 256 sets of values in total i.e. 32 x
 6808     // 8H quadwords. So, we can load, add and store the data in 3
 6809     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6810     // of 10 or 11 registers. A further constraint is that the
 6811     // mapping needs to skip callee saves. So, we allocate the
 6812     // register sequences using two 8 sequences, two 2 sequences
 6813     // and two single registers.
 6814     VSeq<8> vs1_1(0);
 6815     VSeq<2> vs1_2(16);
 6816     FloatRegister vs1_3 = v28;
 6817     VSeq<8> vs2_1(18);
 6818     VSeq<2> vs2_2(26);
 6819     FloatRegister vs2_3 = v29;
 6820 
 6821     // we also need a pair of corresponding constant sequences
 6822 
 6823     VSeq<8> vc1_1(30, 0);
 6824     VSeq<2> vc1_2(30, 0);
 6825     FloatRegister vc1_3 = v30; // for kyber_q
 6826 
 6827     VSeq<8> vc2_1(31, 0);
 6828     VSeq<2> vc2_2(31, 0);
 6829     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6830 
 6831     __ add(result, coeffs, 0);
 6832     __ lea(kyberConsts,
 6833              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6834 
 6835     // load q and the multiplier for the Barrett reduction
 6836     __ add(kyberConsts, kyberConsts, 16);
 6837     __ ldpq(vc1_3, vc2_3, kyberConsts);
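
         // In scalar terms each coefficient x is reduced below as
         // (illustrative only, assuming kyberBarrettMultiplier is the
         // usual rounding of 2^26 / kyber_q):
         //
         //   int t = ((int)x * kyberBarrettMultiplier) >> 26; // sqdmulh + sshr
         //   x = x - t * kyber_q;                             // mlsv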
 6838 
 6839     for (int i = 0; i < 3; i++) {
 6840       // load 80 or 88 coefficients
 6841       vs_ldpq_post(vs1_1, coeffs);
 6842       vs_ldpq_post(vs1_2, coeffs);
 6843       if (i < 2) {
 6844         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6845       }
 6846 
 6847       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6848       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6849       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6850       if (i < 2) {
 6851         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6852       }
 6853 
 6854       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6855       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6856       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6857       if (i < 2) {
 6858         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6859       }
 6860 
 6861       // vs1 <- vs1 - vs2 * kyber_q
 6862       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6863       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6864       if (i < 2) {
 6865         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6866       }
 6867 
 6868       vs_stpq_post(vs1_1, result);
 6869       vs_stpq_post(vs1_2, result);
 6870       if (i < 2) {
 6871         __ str(vs1_3, __ Q, __ post(result, 16));
 6872       }
 6873     }
 6874 
 6875     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6876     __ mov(r0, zr); // return 0
 6877     __ ret(lr);
 6878 
 6879     // record the stub entry and end
 6880     store_archive_data(stub_id, start, __ pc());
 6881 
 6882     return start;
 6883   }
 6884 
 6885 
 6886   // Dilithium-specific montmul helper routines that generate parallel
 6887   // code for, respectively, a single 4x4s vector sequence montmul or
 6888   // two such multiplies in a row.
 6889 
 6890   // Perform 16 32-bit Montgomery multiplications in parallel
 6891   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6892                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6893     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6894     // It will assert that the register use is valid
 6895     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6896   }
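
       // n.b. up to a multiple of the modulus, each 4S lane computes
       //
       //   montmul(b, c) == b * c * R^-1 mod dilithium_q
       //
       // assuming the usual Montgomery radix R == 2^32 for 32-bit lanes.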
 6897 
 6898   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6899   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6900                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6901     // Schedule two successive 4x4S multiplies via the montmul helper
 6902     // on the front and back halves of va, vb and vc. The helper will
 6903     // assert that the register use has no overlap conflicts on each
 6904     // individual call but we also need to ensure that the necessary
 6905     // disjoint/equality constraints are met across both calls.
 6906 
 6907     // vb, vc, vtmp and vq must be disjoint. va must either be
 6908     // disjoint from all other registers or equal vc
 6909 
 6910     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6911     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6912     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6913 
 6914     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6915     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6916 
 6917     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6918 
 6919     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6920     assert(vs_disjoint(va, vb), "va and vb overlap");
 6921     assert(vs_disjoint(va, vq), "va and vq overlap");
 6922     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6923 
 6924     // We multiply the front and back halves of each sequence 4 at a
 6925     // time because
 6926     //
 6927     // 1) we are currently only able to get 4-way instruction
 6928     // parallelism at best
 6929     //
 6930     // 2) we need registers for the constants in vq and temporary
 6931     // scratch registers to hold intermediate results so vtmp can only
 6932     // be a VSeq<4> which means we only have 4 scratch slots.
 6933 
 6934     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6935     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6936   }
 6937 
 6938   // Perform combined montmul then add/sub on 4x4S vectors.
 6939   void dilithium_montmul16_sub_add(
 6940           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6941           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6942     // compute a = montmul(a1, c)
 6943     dilithium_montmul16(vc, va1, vc, vtmp, vq);
6944     // output a1 = a0 - a
 6945     vs_subv(va1, __ T4S, va0, vc);
 6946     //    and a0 = a0 + a
 6947     vs_addv(va0, __ T4S, va0, vc);
 6948   }
 6949 
6950   // Perform combined add/sub then montmul on 4x4S vectors.
 6951   void dilithium_sub_add_montmul16(
 6952           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6953           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6954     // compute c = a0 - a1
 6955     vs_subv(vtmp1, __ T4S, va0, va1);
 6956     // output a0 = a0 + a1
 6957     vs_addv(va0, __ T4S, va0, va1);
 6958     // output a1 = b montmul c
 6959     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6960   }
 6961 
 6962   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6963   // in the Java implementation come in sequences of at least 8, so we
 6964   // can use ldpq to collect the corresponding data into pairs of vector
 6965   // registers.
 6966   // We collect the coefficients corresponding to the 'j+l' indexes into
 6967   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6968   // then we do the (Montgomery) multiplications by the zetas in parallel
 6969   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6970   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6971   // v0-v7 and finally save the results back to the coeffs array.
 6972   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6973     const Register coeffs, const Register zetas) {
 6974     int c1 = 0;
 6975     int c2 = 512;
 6976     int startIncr;
 6977     // don't use callee save registers v8 - v15
 6978     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6979     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6980     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6981     int offsets[4] = { 0, 32, 64, 96 };
 6982 
 6983     for (int level = 0; level < 5; level++) {
 6984       int c1Start = c1;
 6985       int c2Start = c2;
 6986       if (level == 3) {
 6987         offsets[1] = 32;
 6988         offsets[2] = 128;
 6989         offsets[3] = 160;
 6990       } else if (level == 4) {
 6991         offsets[1] = 64;
 6992         offsets[2] = 128;
 6993         offsets[3] = 192;
 6994       }
 6995 
6996       // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 6997       // time at 4 different offsets and multiply them in order by the
 6998       // next set of input values. So we employ indexed load and store
 6999       // pair instructions with arrangement 4S.
 7000       for (int i = 0; i < 4; i++) {
 7001         // reload q and qinv
 7002         vs_ldpq(vq, dilithiumConsts); // qInv, q
 7003         // load 8x4S coefficients via second start pos == c2
 7004         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 7005         // load next 8x4S inputs == b
 7006         vs_ldpq_post(vs2, zetas);
 7007         // compute a == c2 * b mod MONT_Q
 7008         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7009         // load 8x4s coefficients via first start pos == c1
 7010         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7011         // compute a1 =  c1 + a
 7012         vs_addv(vs3, __ T4S, vs1, vs2);
 7013         // compute a2 =  c1 - a
 7014         vs_subv(vs1, __ T4S, vs1, vs2);
 7015         // output a1 and a2
 7016         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7017         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 7018 
 7019         int k = 4 * level + i;
 7020 
 7021         if (k > 7) {
 7022           startIncr = 256;
 7023         } else if (k == 5) {
 7024           startIncr = 384;
 7025         } else {
 7026           startIncr = 128;
 7027         }
 7028 
 7029         c1Start += startIncr;
 7030         c2Start += startIncr;
 7031       }
 7032 
 7033       c2 /= 2;
 7034     }
 7035   }
 7036 
 7037   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 7038   // Implements the method
 7039   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
7040   // of the Java class sun.security.provider.ML_DSA
 7041   //
 7042   // coeffs (int[256]) = c_rarg0
 7043   // zetas (int[256]) = c_rarg1
 7044   address generate_dilithiumAlmostNtt() {
 7045     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 7046     int entry_count = StubInfo::entry_count(stub_id);
 7047     assert(entry_count == 1, "sanity check");
 7048     address start = load_archive_data(stub_id);
 7049     if (start != nullptr) {
 7050       return start;
 7051     }
 7052     __ align(CodeEntryAlignment);
 7053     StubCodeMark mark(this, stub_id);
 7054     start = __ pc();
 7055     __ enter();
 7056 
 7057     const Register coeffs = c_rarg0;
 7058     const Register zetas = c_rarg1;
 7059 
 7060     const Register tmpAddr = r9;
 7061     const Register dilithiumConsts = r10;
 7062     const Register result = r11;
 7063     // don't use callee save registers v8 - v15
 7064     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7065     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7066     VSeq<2> vq(30);                    // n.b. constants overlap vs3
7067     int offsets[4] = { 0, 32, 64, 96 };
 7068     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7069     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7070     __ add(result, coeffs, 0);
 7071     __ lea(dilithiumConsts,
 7072              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7073 
 7074     // Each level represents one iteration of the outer for loop of the Java version.
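         //
         // As a reference, each level applies the Java butterfly (names
         // illustrative rather than the actual Java locals):
         //
         //   int tmp = montMul(zetas[k], coeffs[j + l]);
         //   coeffs[j + l] = coeffs[j] - tmp;
         //   coeffs[j] = coeffs[j] + tmp;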
 7075 
 7076     // level 0-4
 7077     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 7078 
 7079     // level 5
 7080 
 7081     // At level 5 the coefficients we need to combine with the zetas
 7082     // are grouped in memory in blocks of size 4. So, for both sets of
 7083     // coefficients we load 4 adjacent values at 8 different offsets
 7084     // using an indexed ldr with register variant Q and multiply them
 7085     // in sequence order by the next set of inputs. Likewise we store
7086     // the results using an indexed str with register variant Q.
 7087     for (int i = 0; i < 1024; i += 256) {
 7088       // reload constants q, qinv each iteration as they get clobbered later
 7089       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7090       // load 32 (8x4S) coefficients via first offsets = c1
 7091       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7092       // load next 32 (8x4S) inputs = b
 7093       vs_ldpq_post(vs2, zetas);
7094       // a = b montmul c1
 7095       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7096       // load 32 (8x4S) coefficients via second offsets = c2
 7097       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 7098       // add/sub with result of multiply
7099       vs_addv(vs3, __ T4S, vs1, vs2);     // a0 = c2 + a
7100       vs_subv(vs1, __ T4S, vs1, vs2);     // a1 = c2 - a
 7101       // write back new coefficients using same offsets
 7102       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 7103       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 7104     }
 7105 
 7106     // level 6
 7107     // At level 6 the coefficients we need to combine with the zetas
7108     // are grouped in memory in pairs, with each pair of add/sub
7109     // inputs followed by a pair of montmul inputs. We can still implement
 7110     // the montmul+sub+add using 4-way parallelism but only if we
 7111     // combine the coefficients with the zetas 16 at a time. We load 8
 7112     // adjacent values at 4 different offsets using an ld2 load with
 7113     // arrangement 2D. That interleaves the lower and upper halves of
 7114     // each pair of quadwords into successive vector registers. We
7115     // then need to montmul the 4 odd elements of the coefficients
7116     // register sequence by the zetas in order and then add/sub that
7117     // product to/from the 4 even elements. We use an
 7118     // equivalent st2 operation to store the results back into memory
 7119     // de-interleaved.
 7120     for (int i = 0; i < 1024; i += 128) {
 7121       // reload constants q, qinv each iteration as they get clobbered later
 7122       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7123       // load interleaved 16 (4x2D) coefficients via offsets
 7124       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7125       // load next 16 (4x4S) inputs
 7126       vs_ldpq_post(vs_front(vs2), zetas);
 7127       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7128       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7129                                   vs_front(vs2), vtmp, vq);
 7130       // store interleaved 16 (4x2D) coefficients via offsets
 7131       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7132     }
 7133 
 7134     // level 7
 7135     // At level 7 the coefficients we need to combine with the zetas
7136     // occur singly with montmul inputs alternating with add/sub
 7137     // inputs. Once again we can use 4-way parallelism to combine 16
 7138     // zetas at a time. However, we have to load 8 adjacent values at
 7139     // 4 different offsets using an ld2 load with arrangement 4S. That
7140     // interleaves the even words of each pair into one
7141     // coefficients vector register and the odd words of the pair
7142     // into the next register. We then need to montmul the 4 odd
7143     // elements of the coefficients register sequence by the zetas in
7144     // order and then add/sub that product to/from the 4 even
7145     // elements. We use an equivalent st2 operation to store
 7146     // the results back into memory de-interleaved.
 7147 
 7148     for (int i = 0; i < 1024; i += 128) {
 7149       // reload constants q, qinv each iteration as they get clobbered later
 7150       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7151       // load interleaved 16 (4x4S) coefficients via offsets
 7152       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7153       // load next 16 (4x4S) inputs
 7154       vs_ldpq_post(vs_front(vs2), zetas);
 7155       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7156       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7157                                   vs_front(vs2), vtmp, vq);
 7158       // store interleaved 16 (4x4S) coefficients via offsets
 7159       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7160     }
 7161     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7162     __ mov(r0, zr); // return 0
 7163     __ ret(lr);
 7164 
 7165     // record the stub entry and end
 7166     store_archive_data(stub_id, start, __ pc());
 7167 
 7168     return start;
 7169   }
 7170 
 7171   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 7172   // in the Java implementation come in sequences of at least 8, so we
 7173   // can use ldpq to collect the corresponding data into pairs of vector
 7174   // registers
 7175   // We collect the coefficients that correspond to the 'j's into vs1
7176   // the coefficients that correspond to the 'j+l's into vs2 then
 7177   // do the additions into vs3 and the subtractions into vs1 then
 7178   // save the result of the additions, load the zetas into vs2
 7179   // do the (Montgomery) multiplications by zeta in parallel into vs2
 7180   // finally save the results back to the coeffs array
 7181   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 7182     const Register coeffs, const Register zetas) {
 7183     int c1 = 0;
 7184     int c2 = 32;
 7185     int startIncr;
 7186     int offsets[4];
 7187     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7188     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7189     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7190 
 7191     offsets[0] = 0;
 7192 
 7193     for (int level = 3; level < 8; level++) {
 7194       int c1Start = c1;
 7195       int c2Start = c2;
 7196       if (level == 3) {
 7197         offsets[1] = 64;
 7198         offsets[2] = 128;
 7199         offsets[3] = 192;
 7200       } else if (level == 4) {
 7201         offsets[1] = 32;
 7202         offsets[2] = 128;
 7203         offsets[3] = 160;
 7204       } else {
 7205         offsets[1] = 32;
 7206         offsets[2] = 64;
 7207         offsets[3] = 96;
 7208       }
 7209 
 7210       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 7211       // time at 4 different offsets and multiply them in order by the
 7212       // next set of input values. So we employ indexed load and store
 7213       // pair instructions with arrangement 4S.
 7214       for (int i = 0; i < 4; i++) {
 7215         // load v1 32 (8x4S) coefficients relative to first start index
 7216         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7217         // load v2 32 (8x4S) coefficients relative to second start index
 7218         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
7219         // a0 = v1 + v2 -- n.b. clobbers vq
 7220         vs_addv(vs3, __ T4S, vs1, vs2);
 7221         // a1 = v1 - v2
 7222         vs_subv(vs1, __ T4S, vs1, vs2);
7223         // save a0 relative to first start index
 7224         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7225         // load constants q, qinv each iteration as they get clobbered above
 7226         vs_ldpq(vq, dilithiumConsts); // qInv, q
 7227         // load b next 32 (8x4S) inputs
 7228         vs_ldpq_post(vs2, zetas);
 7229         // a = a1 montmul b
 7230         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7231         // save a relative to second start index
 7232         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 7233 
 7234         int k = 4 * level + i;
 7235 
 7236         if (k < 24) {
 7237           startIncr = 256;
 7238         } else if (k == 25) {
 7239           startIncr = 384;
 7240         } else {
 7241           startIncr = 128;
 7242         }
 7243 
 7244         c1Start += startIncr;
 7245         c2Start += startIncr;
 7246       }
 7247 
 7248       c2 *= 2;
 7249     }
 7250   }
 7251 
 7252   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 7253   // Implements the method
 7254   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 7255   // the sun.security.provider.ML_DSA class.
 7256   //
 7257   // coeffs (int[256]) = c_rarg0
 7258   // zetas (int[256]) = c_rarg1
 7259   address generate_dilithiumAlmostInverseNtt() {
 7260     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 7261     int entry_count = StubInfo::entry_count(stub_id);
 7262     assert(entry_count == 1, "sanity check");
 7263     address start = load_archive_data(stub_id);
 7264     if (start != nullptr) {
 7265       return start;
 7266     }
 7267     __ align(CodeEntryAlignment);
 7268     StubCodeMark mark(this, stub_id);
 7269     start = __ pc();
 7270     __ enter();
 7271 
 7272     const Register coeffs = c_rarg0;
 7273     const Register zetas = c_rarg1;
 7274 
 7275     const Register tmpAddr = r9;
 7276     const Register dilithiumConsts = r10;
 7277     const Register result = r11;
 7278     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7279     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 7280     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7281     int offsets[4] = { 0, 32, 64, 96 };
 7282     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7283     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7284 
 7285     __ add(result, coeffs, 0);
 7286     __ lea(dilithiumConsts,
 7287              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7288 
 7289     // Each level represents one iteration of the outer for loop of the Java version
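         //
         // As a reference, each level applies the Java inverse butterfly
         // (names illustrative):
         //
         //   int tmp = coeffs[j];
         //   coeffs[j] = tmp + coeffs[j + l];
         //   coeffs[j + l] = montMul(tmp - coeffs[j + l], zetas[k]);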
 7290 
 7291     // level 0
 7292     // At level 0 we need to interleave adjacent quartets of
 7293     // coefficients before we multiply and add/sub by the next 16
 7294     // zetas just as we did for level 7 in the multiply code. So we
 7295     // load and store the values using an ld2/st2 with arrangement 4S.
 7296     for (int i = 0; i < 1024; i += 128) {
 7297       // load constants q, qinv
 7298       // n.b. this can be moved out of the loop as they do not get
 7299       // clobbered by first two loops
 7300       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7301       // a0/a1 load interleaved 32 (8x4S) coefficients
 7302       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7303       // b load next 32 (8x4S) inputs
 7304       vs_ldpq_post(vs_front(vs2), zetas);
 7305       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7306       // n.b. second half of vs2 provides temporary register storage
 7307       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7308                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7309       // a0/a1 store interleaved 32 (8x4S) coefficients
 7310       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7311     }
 7312 
 7313     // level 1
 7314     // At level 1 we need to interleave pairs of adjacent pairs of
 7315     // coefficients before we multiply by the next 16 zetas just as we
 7316     // did for level 6 in the multiply code. So we load and store the
 7317     // values an ld2/st2 with arrangement 2D.
 7318     for (int i = 0; i < 1024; i += 128) {
 7319       // a0/a1 load interleaved 32 (8x2D) coefficients
 7320       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7321       // b load next 16 (4x4S) inputs
 7322       vs_ldpq_post(vs_front(vs2), zetas);
 7323       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7324       // n.b. second half of vs2 provides temporary register storage
 7325       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7326                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7327       // a0/a1 store interleaved 32 (8x2D) coefficients
 7328       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7329     }
 7330 
 7331     // level 2
 7332     // At level 2 coefficients come in blocks of 4. So, we load 4
 7333     // adjacent coefficients at 8 distinct offsets for both the first
 7334     // and second coefficient sequences, using an ldr with register
 7335     // variant Q then combine them with next set of 32 zetas. Likewise
 7336     // we store the results using an str with register variant Q.
 7337     for (int i = 0; i < 1024; i += 256) {
 7338       // c0 load 32 (8x4S) coefficients via first offsets
 7339       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7340       // c1 load 32 (8x4S) coefficients via second offsets
7341       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 7342       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 7343       vs_addv(vs3, __ T4S, vs1, vs2);
 7344       // c = c0 - c1
 7345       vs_subv(vs1, __ T4S, vs1, vs2);
 7346       // store a0 32 (8x4S) coefficients via first offsets
 7347       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 7348       // b load 32 (8x4S) next inputs
 7349       vs_ldpq_post(vs2, zetas);
 7350       // reload constants q, qinv -- they were clobbered earlier
 7351       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7352       // compute a1 = b montmul c
 7353       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7354       // store a1 32 (8x4S) coefficients via second offsets
 7355       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 7356     }
 7357 
 7358     // level 3-7
 7359     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 7360 
 7361     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7362     __ mov(r0, zr); // return 0
 7363     __ ret(lr);
 7364 
 7365     // record the stub entry and end
 7366     store_archive_data(stub_id, start, __ pc());
 7367 
 7368     return start;
 7369   }
 7370 
 7371   // Dilithium multiply polynomials in the NTT domain.
 7372   // Straightforward implementation of the method
 7373   // static int implDilithiumNttMult(
7374   //              int[] result, int[] ntta, int[] nttb) {} of
 7375   // the sun.security.provider.ML_DSA class.
 7376   //
 7377   // result (int[256]) = c_rarg0
 7378   // poly1 (int[256]) = c_rarg1
 7379   // poly2 (int[256]) = c_rarg2
 7380   address generate_dilithiumNttMult() {
 7381     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 7382     int entry_count = StubInfo::entry_count(stub_id);
 7383     assert(entry_count == 1, "sanity check");
 7384     address start = load_archive_data(stub_id);
 7385     if (start != nullptr) {
 7386       return start;
 7387     }
 7388     __ align(CodeEntryAlignment);
 7389     StubCodeMark mark(this, stub_id);
 7390     start = __ pc();
 7391     __ enter();
 7392 
 7393     Label L_loop;
 7394 
 7395     const Register result = c_rarg0;
 7396     const Register poly1 = c_rarg1;
 7397     const Register poly2 = c_rarg2;
 7398 
 7399     const Register dilithiumConsts = r10;
 7400     const Register len = r11;
 7401 
 7402     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7403     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7404     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7405     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 7406 
 7407     __ lea(dilithiumConsts,
 7408              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7409 
 7410     // load constants q, qinv
 7411     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7412     // load constant rSquare into v29
 7413     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
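
         // n.b. each montmul divides by the Montgomery radix R mod q,
         // so montmul(rSquare, montmul(b, c)) == b * c mod q assuming
         // rSquare == R^2 mod q; the second multiply below returns the
         // product to the standard domain.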
 7414 
 7415     __ mov(len, zr);
 7416     __ add(len, len, 1024);
 7417 
 7418     __ BIND(L_loop);
 7419 
 7420     // b load 32 (8x4S) next inputs from poly1
 7421     vs_ldpq_post(vs1, poly1);
 7422     // c load 32 (8x4S) next inputs from poly2
 7423     vs_ldpq_post(vs2, poly2);
 7424     // compute a = b montmul c
 7425     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7426     // compute a = rsquare montmul a
 7427     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 7428     // save a 32 (8x4S) results
 7429     vs_stpq_post(vs2, result);
 7430 
 7431     __ sub(len, len, 128);
 7432     __ cmp(len, (u1)128);
 7433     __ br(Assembler::GE, L_loop);
 7434 
 7435     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7436     __ mov(r0, zr); // return 0
 7437     __ ret(lr);
 7438 
 7439     // record the stub entry and end
 7440     store_archive_data(stub_id, start, __ pc());
 7441 
 7442     return start;
 7443   }
 7444 
7445   // Dilithium Montgomery multiply an array by a constant.
 7446   // A straightforward implementation of the method
 7447   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
7448   // of the sun.security.provider.ML_DSA class
 7449   //
 7450   // coeffs (int[256]) = c_rarg0
 7451   // constant (int) = c_rarg1
 7452   address generate_dilithiumMontMulByConstant() {
 7453     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 7454     int entry_count = StubInfo::entry_count(stub_id);
 7455     assert(entry_count == 1, "sanity check");
 7456     address start = load_archive_data(stub_id);
 7457     if (start != nullptr) {
 7458       return start;
 7459     }
 7460     __ align(CodeEntryAlignment);
 7461     StubCodeMark mark(this, stub_id);
 7462     start = __ pc();
 7463     __ enter();
 7464 
 7465     Label L_loop;
 7466 
 7467     const Register coeffs = c_rarg0;
 7468     const Register constant = c_rarg1;
 7469 
 7470     const Register dilithiumConsts = r10;
 7471     const Register result = r11;
 7472     const Register len = r12;
 7473 
 7474     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7475     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7476     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7477     VSeq<8> vconst(29, 0);             // for montmul by constant
 7478 
 7479     // results track inputs
 7480     __ add(result, coeffs, 0);
 7481     __ lea(dilithiumConsts,
 7482              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7483 
7484     // load constants q, qinv -- they do not get clobbered by the loop below
 7485     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7486     // copy caller supplied constant across vconst
 7487     __ dup(vconst[0], __ T4S, constant);
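         // n.b. vconst was declared with stride 0, so all eight
         // sequence slots alias v29 and every montmul lane sees the
         // same broadcast multiplier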
 7488     __ mov(len, zr);
 7489     __ add(len, len, 1024);
 7490 
 7491     __ BIND(L_loop);
 7492 
 7493     // load next 32 inputs
 7494     vs_ldpq_post(vs2, coeffs);
 7495     // mont mul by constant
 7496     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 7497     // write next 32 results
 7498     vs_stpq_post(vs2, result);
 7499 
 7500     __ sub(len, len, 128);
 7501     __ cmp(len, (u1)128);
 7502     __ br(Assembler::GE, L_loop);
 7503 
 7504     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7505     __ mov(r0, zr); // return 0
 7506     __ ret(lr);
 7507 
 7508     // record the stub entry and end
 7509     store_archive_data(stub_id, start, __ pc());
 7510 
 7511     return start;
 7512   }
 7513 
 7514   // Dilithium decompose poly.
 7515   // Implements the method
7516   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
 7517   // of the sun.security.provider.ML_DSA class
 7518   //
 7519   // input (int[256]) = c_rarg0
 7520   // lowPart (int[256]) = c_rarg1
 7521   // highPart (int[256]) = c_rarg2
 7522   // twoGamma2  (int) = c_rarg3
 7523   // multiplier (int) = c_rarg4
 7524   address generate_dilithiumDecomposePoly() {
 7525     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 7526     int entry_count = StubInfo::entry_count(stub_id);
 7527     assert(entry_count == 1, "sanity check");
 7528     address start = load_archive_data(stub_id);
 7529     if (start != nullptr) {
 7530       return start;
 7531     }
 7532     __ align(CodeEntryAlignment);
 7533     StubCodeMark mark(this, stub_id);
 7534     start = __ pc();
 7535     Label L_loop;
 7536 
 7537     const Register input = c_rarg0;
 7538     const Register lowPart = c_rarg1;
 7539     const Register highPart = c_rarg2;
 7540     const Register twoGamma2 = c_rarg3;
 7541     const Register multiplier = c_rarg4;
 7542 
 7543     const Register len = r9;
 7544     const Register dilithiumConsts = r10;
 7545     const Register tmp = r11;
 7546 
 7547     // 6 independent sets of 4x4s values
 7548     VSeq<4> vs1(0), vs2(4), vs3(8);
 7549     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7550 
 7551     // 7 constants for cross-multiplying
 7552     VSeq<4> one(25, 0);
 7553     VSeq<4> qminus1(26, 0);
 7554     VSeq<4> g2(27, 0);
 7555     VSeq<4> twog2(28, 0);
 7556     VSeq<4> mult(29, 0);
 7557     VSeq<4> q(30, 0);
 7558     VSeq<4> qadd(31, 0);
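
         // n.b. the stride-0 sequences above each name their single
         // register four times, so one constant pairs lane-wise with
         // all four working vectors in each sequence operation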
 7559 
 7560     __ enter();
 7561 
 7562     __ lea(dilithiumConsts,
 7563              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7564 
 7565     // save callee-saved registers
 7566     __ stpd(v8, v9, __ pre(sp, -64));
 7567     __ stpd(v10, v11, Address(sp, 16));
 7568     __ stpd(v12, v13, Address(sp, 32));
 7569     __ stpd(v14, v15, Address(sp, 48));
 7570 
 7571     // populate constant registers
 7572     __ mov(tmp, zr);
 7573     __ add(tmp, tmp, 1);
 7574     __ dup(one[0], __ T4S, tmp); // 1
 7575     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7576     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7577     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7578     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7579     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7580     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7581 
 7582     __ mov(len, zr);
 7583     __ add(len, len, 1024);
 7584 
 7585     __ BIND(L_loop);
 7586 
 7587     // load next 4x4S inputs interleaved: rplus --> vs1
 7588     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7589 
 7590     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7591     vs_addv(vtmp, __ T4S, vs1, qadd);
 7592     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7593     vs_mulv(vtmp, __ T4S, vtmp, q);
 7594     vs_subv(vs1, __ T4S, vs1, vtmp);
 7595 
 7596     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7597     vs_sshr(vtmp, __ T4S, vs1, 31);
 7598     vs_andr(vtmp, vtmp, q);
 7599     vs_addv(vs1, __ T4S, vs1, vtmp);
 7600 
 7601     // quotient --> vs2
 7602     // int quotient = (rplus * multiplier) >> 22;
 7603     vs_mulv(vtmp, __ T4S, vs1, mult);
 7604     vs_sshr(vs2, __ T4S, vtmp, 22);
 7605 
 7606     // r0 --> vs3
 7607     // int r0 = rplus - quotient * twoGamma2;
 7608     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7609     vs_subv(vs3, __ T4S, vs1, vtmp);
 7610 
 7611     // mask --> vs4
 7612     // int mask = (twoGamma2 - r0) >> 22;
 7613     vs_subv(vtmp, __ T4S, twog2, vs3);
 7614     vs_sshr(vs4, __ T4S, vtmp, 22);
 7615 
 7616     // r0 -= (mask & twoGamma2);
 7617     vs_andr(vtmp, vs4, twog2);
 7618     vs_subv(vs3, __ T4S, vs3, vtmp);
 7619 
 7620     //  quotient += (mask & 1);
 7621     vs_andr(vtmp, vs4, one);
 7622     vs_addv(vs2, __ T4S, vs2, vtmp);
 7623 
 7624     // mask = (twoGamma2 / 2 - r0) >> 31;
 7625     vs_subv(vtmp, __ T4S, g2, vs3);
 7626     vs_sshr(vs4, __ T4S, vtmp, 31);
 7627 
 7628     // r0 -= (mask & twoGamma2);
 7629     vs_andr(vtmp, vs4, twog2);
 7630     vs_subv(vs3, __ T4S, vs3, vtmp);
 7631 
 7632     // quotient += (mask & 1);
 7633     vs_andr(vtmp, vs4, one);
 7634     vs_addv(vs2, __ T4S, vs2, vtmp);
 7635 
 7636     // r1 --> vs5
 7637     // int r1 = rplus - r0 - (dilithium_q - 1);
 7638     vs_subv(vtmp, __ T4S, vs1, vs3);
 7639     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7640 
 7641     // r1 --> vs1 (overwriting rplus)
 7642     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7643     vs_negr(vtmp, __ T4S, vs5);
 7644     vs_orr(vtmp, vs5, vtmp);
 7645     vs_sshr(vs1, __ T4S, vtmp, 31);
 7646 
 7647     // r0 += ~r1;
 7648     vs_notr(vtmp, vs1);
 7649     vs_addv(vs3, __ T4S, vs3, vtmp);
 7650 
 7651     // r1 = r1 & quotient;
 7652     vs_andr(vs1, vs2, vs1);
 7653 
7654     // store results interleaved
 7655     // lowPart[m] = r0;
 7656     // highPart[m] = r1;
 7657     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7658     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7659 
 7660     __ sub(len, len, 64);
 7661     __ cmp(len, (u1)64);
 7662     __ br(Assembler::GE, L_loop);
 7663 
 7664     // restore callee-saved vector registers
 7665     __ ldpd(v14, v15, Address(sp, 48));
 7666     __ ldpd(v12, v13, Address(sp, 32));
 7667     __ ldpd(v10, v11, Address(sp, 16));
 7668     __ ldpd(v8, v9, __ post(sp, 64));
 7669 
 7670     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7671     __ mov(r0, zr); // return 0
 7672     __ ret(lr);
 7673 
 7674     // record the stub entry and end
 7675     store_archive_data(stub_id, start, __ pc());
 7676 
 7677     return start;
 7678   }
 7679 
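       // Keccak chi step on one 5-lane plane held in general purpose
       // registers: a_i ^= ~a_(i+1) & a_(i+2), indices mod 5. The
       // and-not terms for a0..a2 are computed into the temporaries
       // first so that no input is overwritten before it is consumed.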
 7680   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7681              Register tmp0, Register tmp1, Register tmp2) {
 7682     __ bic(tmp0, a2, a1); // for a0
 7683     __ bic(tmp1, a3, a2); // for a1
 7684     __ bic(tmp2, a4, a3); // for a2
 7685     __ eor(a2, a2, tmp2);
 7686     __ bic(tmp2, a0, a4); // for a3
 7687     __ eor(a3, a3, tmp2);
 7688     __ bic(tmp2, a1, a0); // for a4
 7689     __ eor(a0, a0, tmp0);
 7690     __ eor(a1, a1, tmp1);
 7691     __ eor(a4, a4, tmp2);
 7692   }
 7693 
 7694   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7695                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7696                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7697                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7698                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7699                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7700                         Register tmp0, Register tmp1, Register tmp2) {
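         // theta step: compute the column parities c0..c4, derive the
         // d values and mix them into every lane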
 7701     __ eor3(tmp1, a4, a9, a14);
 7702     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7703     __ eor3(tmp2, a1, a6, a11);
 7704     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7705     __ rax1(tmp2, tmp0, tmp1); // d0
 7706     {
 7707 
 7708       Register tmp3, tmp4;
 7709       if (can_use_fp && can_use_r18) {
 7710         tmp3 = rfp;
 7711         tmp4 = r18_tls;
 7712       } else {
 7713         tmp3 = a4;
 7714         tmp4 = a9;
 7715         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7716       }
 7717 
 7718       __ eor3(tmp3, a0, a5, a10);
 7719       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7720       __ eor(a0, a0, tmp2);
 7721       __ eor(a5, a5, tmp2);
 7722       __ eor(a10, a10, tmp2);
 7723       __ eor(a15, a15, tmp2);
 7724       __ eor(a20, a20, tmp2); // d0(tmp2)
 7725       __ eor3(tmp3, a2, a7, a12);
 7726       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7727       __ rax1(tmp3, tmp4, tmp2); // d1
 7728       __ eor(a1, a1, tmp3);
 7729       __ eor(a6, a6, tmp3);
 7730       __ eor(a11, a11, tmp3);
 7731       __ eor(a16, a16, tmp3);
 7732       __ eor(a21, a21, tmp3); // d1(tmp3)
 7733       __ rax1(tmp3, tmp2, tmp0); // d3
 7734       __ eor3(tmp2, a3, a8, a13);
 7735       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7736       __ eor(a3, a3, tmp3);
 7737       __ eor(a8, a8, tmp3);
 7738       __ eor(a13, a13, tmp3);
 7739       __ eor(a18, a18, tmp3);
 7740       __ eor(a23, a23, tmp3);
 7741       __ rax1(tmp2, tmp1, tmp0); // d2
 7742       __ eor(a2, a2, tmp2);
 7743       __ eor(a7, a7, tmp2);
 7744       __ eor(a12, a12, tmp2);
 7745       __ rax1(tmp0, tmp0, tmp4); // d4
 7746       if (!can_use_fp || !can_use_r18) {
 7747         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7748       }
 7749       __ eor(a17, a17, tmp2);
 7750       __ eor(a22, a22, tmp2);
 7751       __ eor(a4, a4, tmp0);
 7752       __ eor(a9, a9, tmp0);
 7753       __ eor(a14, a14, tmp0);
 7754       __ eor(a19, a19, tmp0);
 7755       __ eor(a24, a24, tmp0);
 7756     }
 7757 
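         // rho and pi steps combined: each lane is rotated by its rho
         // offset while being moved along the pi permutation cycle,
         // with tmp0 breaking the cycle so no lane is clobbered before
         // it has been read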
 7758     __ rol(tmp0, a10, 3);
 7759     __ rol(a10, a1, 1);
 7760     __ rol(a1, a6, 44);
 7761     __ rol(a6, a9, 20);
 7762     __ rol(a9, a22, 61);
 7763     __ rol(a22, a14, 39);
 7764     __ rol(a14, a20, 18);
 7765     __ rol(a20, a2, 62);
 7766     __ rol(a2, a12, 43);
 7767     __ rol(a12, a13, 25);
7768     __ rol(a13, a19, 8);
 7769     __ rol(a19, a23, 56);
 7770     __ rol(a23, a15, 41);
 7771     __ rol(a15, a4, 27);
 7772     __ rol(a4, a24, 14);
 7773     __ rol(a24, a21, 2);
 7774     __ rol(a21, a8, 55);
 7775     __ rol(a8, a16, 45);
 7776     __ rol(a16, a5, 36);
 7777     __ rol(a5, a3, 28);
 7778     __ rol(a3, a18, 21);
 7779     __ rol(a18, a17, 15);
 7780     __ rol(a17, a11, 10);
 7781     __ rol(a11, a7, 6);
 7782     __ mov(a7, tmp0);
 7783 
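         // chi step, applied plane by plane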
 7784     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7785     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7786     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7787     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7788     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7789 
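         // iota step: fold the next round constant into lane a0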
 7790     __ ldr(tmp1, __ post(rc, 8));
 7791     __ eor(a0, a0, tmp1);
 7792 
 7793   }
 7794 
 7795   // Arguments:
 7796   //
 7797   // Inputs:
 7798   //   c_rarg0   - byte[]  source+offset
 7799   //   c_rarg1   - byte[]  SHA.state
 7800   //   c_rarg2   - int     block_size
 7801   //   c_rarg3   - int     offset
 7802   //   c_rarg4   - int     limit
 7803   //
 7804   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7805     bool multi_block;
 7806     switch (stub_id) {
 7807     case StubId::stubgen_sha3_implCompress_id:
 7808       multi_block = false;
 7809       break;
 7810     case StubId::stubgen_sha3_implCompressMB_id:
 7811       multi_block = true;
 7812       break;
 7813     default:
 7814       ShouldNotReachHere();
 7815     }
 7816     int entry_count = StubInfo::entry_count(stub_id);
 7817     assert(entry_count == 1, "sanity check");
 7818     address start = load_archive_data(stub_id);
 7819     if (start != nullptr) {
 7820       return start;
 7821     }
 7822     __ align(CodeEntryAlignment);
 7823     StubCodeMark mark(this, stub_id);
 7824     start = __ pc();
 7825 
 7826     Register buf           = c_rarg0;
 7827     Register state         = c_rarg1;
 7828     Register block_size    = c_rarg2;
 7829     Register ofs           = c_rarg3;
 7830     Register limit         = c_rarg4;
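
         // block_size selects the SHA3/SHAKE variant via its sponge
         // rate in bytes: 72 for SHA3-512, 104 for SHA3-384, 136 for
         // SHA3-256/SHAKE256, 144 for SHA3-224 and 168 for SHAKE128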
 7831 
7832     // use r3..r17, r19..r28 to keep a0..a24.
 7833     // a0..a24 are respective locals from SHA3.java
 7834     Register a0 = r25,
 7835              a1 = r26,
 7836              a2 = r27,
 7837              a3 = r3,
 7838              a4 = r4,
 7839              a5 = r5,
 7840              a6 = r6,
 7841              a7 = r7,
 7842              a8 = rscratch1, // r8
 7843              a9 = rscratch2, // r9
 7844              a10 = r10,
 7845              a11 = r11,
 7846              a12 = r12,
 7847              a13 = r13,
 7848              a14 = r14,
 7849              a15 = r15,
 7850              a16 = r16,
 7851              a17 = r17,
 7852              a18 = r28,
 7853              a19 = r19,
 7854              a20 = r20,
 7855              a21 = r21,
 7856              a22 = r22,
 7857              a23 = r23,
 7858              a24 = r24;
 7859 
 7860     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7861 
 7862     Label sha3_loop, rounds24_preloop, loop_body;
 7863     Label sha3_512_or_sha3_384, shake128;
 7864 
 7865     bool can_use_r18 = false;
 7866 #ifndef R18_RESERVED
 7867     can_use_r18 = true;
 7868 #endif
 7869     bool can_use_fp = !PreserveFramePointer;
 7870 
 7871     __ enter();
 7872 
7873     // save almost all of the as yet unsaved gpr registers on the stack
 7874     __ str(block_size, __ pre(sp, -128));
 7875     if (multi_block) {
 7876       __ stpw(ofs, limit, Address(sp, 8));
 7877     }
 7878     // 8 bytes at sp+16 will be used to keep buf
 7879     __ stp(r19, r20, Address(sp, 32));
 7880     __ stp(r21, r22, Address(sp, 48));
 7881     __ stp(r23, r24, Address(sp, 64));
 7882     __ stp(r25, r26, Address(sp, 80));
 7883     __ stp(r27, r28, Address(sp, 96));
 7884     if (can_use_r18 && can_use_fp) {
 7885       __ stp(r18_tls, state, Address(sp, 112));
 7886     } else {
 7887       __ str(state, Address(sp, 112));
 7888     }
 7889 
7890     // begin sha3 calculations: loading a0..a24 from state array
 7891     __ ldp(a0, a1, state);
 7892     __ ldp(a2, a3, Address(state, 16));
 7893     __ ldp(a4, a5, Address(state, 32));
 7894     __ ldp(a6, a7, Address(state, 48));
 7895     __ ldp(a8, a9, Address(state, 64));
 7896     __ ldp(a10, a11, Address(state, 80));
 7897     __ ldp(a12, a13, Address(state, 96));
 7898     __ ldp(a14, a15, Address(state, 112));
 7899     __ ldp(a16, a17, Address(state, 128));
 7900     __ ldp(a18, a19, Address(state, 144));
 7901     __ ldp(a20, a21, Address(state, 160));
 7902     __ ldp(a22, a23, Address(state, 176));
 7903     __ ldr(a24, Address(state, 192));
 7904 
 7905     __ BIND(sha3_loop);
 7906 
 7907     // load input
 7908     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7909     __ eor(a0, a0, tmp3);
 7910     __ eor(a1, a1, tmp2);
 7911     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7912     __ eor(a2, a2, tmp3);
 7913     __ eor(a3, a3, tmp2);
 7914     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7915     __ eor(a4, a4, tmp3);
 7916     __ eor(a5, a5, tmp2);
 7917     __ ldr(tmp3, __ post(buf, 8));
 7918     __ eor(a6, a6, tmp3);
 7919 
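    // Dispatch on the Keccak rate (block_size, in bytes). For reference, the
    // values the bit tests below distinguish:
    //   SHA3-512: 72  = 0b01001000    SHA3-384: 104 = 0b01101000
    //   SHA3-256: 136 = 0b10001000    SHAKE256: 136 = 0b10001000
    //   SHA3-224: 144 = 0b10010000    SHAKE128: 168 = 0b10101000
    // Bit 7 separates {72, 104} from the rest; bits 4..5 then tell
    // 136, 144 and 168 apart.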
 7920     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7921     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7922 
 7923     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7924     __ eor(a7, a7, tmp3);
 7925     __ eor(a8, a8, tmp2);
 7926     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7927     __ eor(a9, a9, tmp3);
 7928     __ eor(a10, a10, tmp2);
 7929     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7930     __ eor(a11, a11, tmp3);
 7931     __ eor(a12, a12, tmp2);
 7932     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7933     __ eor(a13, a13, tmp3);
 7934     __ eor(a14, a14, tmp2);
 7935     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7936     __ eor(a15, a15, tmp3);
 7937     __ eor(a16, a16, tmp2);
 7938 
 7939     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7940     __ andw(tmp2, block_size, 48);
 7941     __ cbzw(tmp2, rounds24_preloop);
 7942     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
 7944     __ ldr(tmp3, __ post(buf, 8));
 7945     __ eor(a17, a17, tmp3);
 7946     __ b(rounds24_preloop);
 7947 
 7948     __ BIND(shake128);
 7949     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7950     __ eor(a17, a17, tmp3);
 7951     __ eor(a18, a18, tmp2);
 7952     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7953     __ eor(a19, a19, tmp3);
 7954     __ eor(a20, a20, tmp2);
 7955     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7956 
 7957     __ BIND(sha3_512_or_sha3_384);
 7958     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7959     __ eor(a7, a7, tmp3);
 7960     __ eor(a8, a8, tmp2);
 7961     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7962 
 7963     // SHA3-384
 7964     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7965     __ eor(a9, a9, tmp3);
 7966     __ eor(a10, a10, tmp2);
 7967     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7968     __ eor(a11, a11, tmp3);
 7969     __ eor(a12, a12, tmp2);
 7970 
 7971     __ BIND(rounds24_preloop);
    __ fmovs(v0, 24.0); // float loop counter: all GPRs hold state, so the
    __ fmovs(v1, 1.0);  // round count lives in v0; 24.0 and 1.0 are exact in binary32
 7974 
 7975     __ str(buf, Address(sp, 16));
 7976     __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));
 7977 
 7978     __ BIND(loop_body);
 7979     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7980                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7981                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7982                      tmp0, tmp1, tmp2);
 7983     __ fsubs(v0, v0, v1);
 7984     __ fcmps(v0, 0.0);
 7985     __ br(__ NE, loop_body);
 7986 
 7987     if (multi_block) {
 7988       __ ldrw(block_size, sp); // block_size
 7989       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7990       __ addw(tmp2, tmp2, block_size);
 7991       __ cmpw(tmp2, tmp1);
 7992       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7993       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7994       __ br(Assembler::LE, sha3_loop);
 7995       __ movw(c_rarg0, tmp2); // return offset
 7996     }
 7997     if (can_use_fp && can_use_r18) {
 7998       __ ldp(r18_tls, state, Address(sp, 112));
 7999     } else {
 8000       __ ldr(state, Address(sp, 112));
 8001     }
 8002     // save calculated sha3 state
 8003     __ stp(a0, a1, Address(state));
 8004     __ stp(a2, a3, Address(state, 16));
 8005     __ stp(a4, a5, Address(state, 32));
 8006     __ stp(a6, a7, Address(state, 48));
 8007     __ stp(a8, a9, Address(state, 64));
 8008     __ stp(a10, a11, Address(state, 80));
 8009     __ stp(a12, a13, Address(state, 96));
 8010     __ stp(a14, a15, Address(state, 112));
 8011     __ stp(a16, a17, Address(state, 128));
 8012     __ stp(a18, a19, Address(state, 144));
 8013     __ stp(a20, a21, Address(state, 160));
 8014     __ stp(a22, a23, Address(state, 176));
 8015     __ str(a24, Address(state, 192));
 8016 
 8017     // restore required registers from stack
 8018     __ ldp(r19, r20, Address(sp, 32));
 8019     __ ldp(r21, r22, Address(sp, 48));
 8020     __ ldp(r23, r24, Address(sp, 64));
 8021     __ ldp(r25, r26, Address(sp, 80));
 8022     __ ldp(r27, r28, Address(sp, 96));
 8023     if (can_use_fp && can_use_r18) {
 8024       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 8025     } // else no need to recalculate rfp, since it wasn't changed
 8026 
 8027     __ leave();
 8028 
 8029     __ ret(lr);
 8030 
 8031     // record the stub entry and end
 8032     store_archive_data(stub_id, start, __ pc());
 8033 
 8034     return start;
 8035   }
 8036 
 8037   /**
 8038    *  Arguments:
 8039    *
 8040    * Inputs:
 8041    *   c_rarg0   - int crc
 8042    *   c_rarg1   - byte* buf
 8043    *   c_rarg2   - int length
 8044    *
 8045    * Output:
   *       r0   - int crc result
 8047    */
 8048   address generate_updateBytesCRC32() {
 8049     assert(UseCRC32Intrinsics, "what are we doing here?");
 8050     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 8051     int entry_count = StubInfo::entry_count(stub_id);
 8052     assert(entry_count == 1, "sanity check");
 8053     address start = load_archive_data(stub_id);
 8054     if (start != nullptr) {
 8055       return start;
 8056     }
 8057     __ align(CodeEntryAlignment);
 8058     StubCodeMark mark(this, stub_id);
 8059 
 8060     start = __ pc();
 8061 
 8062     const Register crc   = c_rarg0;  // crc
 8063     const Register buf   = c_rarg1;  // source java byte array address
 8064     const Register len   = c_rarg2;  // length
 8065     const Register table0 = c_rarg3; // crc_table address
 8066     const Register table1 = c_rarg4;
 8067     const Register table2 = c_rarg5;
 8068     const Register table3 = c_rarg6;
 8069     const Register tmp3 = c_rarg7;
 8070 
 8071     BLOCK_COMMENT("Entry:");
 8072     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8073 
 8074     __ kernel_crc32(crc, buf, len,
 8075               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 8076 
 8077     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8078     __ ret(lr);
 8079 
 8080     // record the stub entry and end
 8081     store_archive_data(stub_id, start, __ pc());
 8082 
 8083     return start;
 8084   }
 8085 
 8086   /**
 8087    *  Arguments:
 8088    *
 8089    * Inputs:
 8090    *   c_rarg0   - int crc
 8091    *   c_rarg1   - byte* buf
 8092    *   c_rarg2   - int length
 8093    *   c_rarg3   - int* table
 8094    *
 8095    * Output:
 8096    *       r0   - int crc result
 8097    */
 8098   address generate_updateBytesCRC32C() {
 8099     assert(UseCRC32CIntrinsics, "what are we doing here?");
 8100     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 8101     int entry_count = StubInfo::entry_count(stub_id);
 8102     assert(entry_count == 1, "sanity check");
 8103     address start = load_archive_data(stub_id);
 8104     if (start != nullptr) {
 8105       return start;
 8106     }
 8107     __ align(CodeEntryAlignment);
 8108     StubCodeMark mark(this, stub_id);
 8109 
 8110     start = __ pc();
 8111 
 8112     const Register crc   = c_rarg0;  // crc
 8113     const Register buf   = c_rarg1;  // source java byte array address
 8114     const Register len   = c_rarg2;  // length
 8115     const Register table0 = c_rarg3; // crc_table address
 8116     const Register table1 = c_rarg4;
 8117     const Register table2 = c_rarg5;
 8118     const Register table3 = c_rarg6;
 8119     const Register tmp3 = c_rarg7;
 8120 
 8121     BLOCK_COMMENT("Entry:");
 8122     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8123 
 8124     __ kernel_crc32c(crc, buf, len,
 8125               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 8126 
 8127     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8128     __ ret(lr);
 8129 
 8130     // record the stub entry and end
 8131     store_archive_data(stub_id, start, __ pc());
 8132 
 8133     return start;
 8134   }
 8135 
  /**
 8137    *  Arguments:
 8138    *
 8139    *  Inputs:
 8140    *   c_rarg0   - int   adler
 8141    *   c_rarg1   - byte* buff
 8142    *   c_rarg2   - int   len
 8143    *
 8144    * Output:
 8145    *   c_rarg0   - int adler result
 8146    */
 8147   address generate_updateBytesAdler32() {
 8148     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 8149     int entry_count = StubInfo::entry_count(stub_id);
 8150     assert(entry_count == 1, "sanity check");
 8151     address start = load_archive_data(stub_id);
 8152     if (start != nullptr) {
 8153       return start;
 8154     }
 8155     __ align(CodeEntryAlignment);
 8156     StubCodeMark mark(this, stub_id);
 8157     start = __ pc();
 8158 
 8159     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 8160 
 8161     // Aliases
 8162     Register adler  = c_rarg0;
 8163     Register s1     = c_rarg0;
 8164     Register s2     = c_rarg3;
 8165     Register buff   = c_rarg1;
 8166     Register len    = c_rarg2;
 8167     Register nmax  = r4;
 8168     Register base  = r5;
 8169     Register count = r6;
 8170     Register temp0 = rscratch1;
 8171     Register temp1 = rscratch2;
 8172     FloatRegister vbytes = v0;
 8173     FloatRegister vs1acc = v1;
 8174     FloatRegister vs2acc = v2;
 8175     FloatRegister vtable = v3;
 8176 
 8177     // Max number of bytes we can process before having to take the mod
 8178     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 8179     uint64_t BASE = 0xfff1;
 8180     uint64_t NMAX = 0x15B0;
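
    // For reference, a minimal scalar sketch of the checksum this stub
    // vectorizes (illustrative only, not code from this file):
    //
    //   uint32_t adler32(uint32_t adler, const uint8_t* buff, int len) {
    //     uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
    //     for (int i = 0; i < len; i++) {
    //       s1 = (s1 + buff[i]) % 65521;  // % BASE
    //       s2 = (s2 + s1)      % 65521;
    //     }
    //     return (s2 << 16) | s1;
    //   }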
 8181 
 8182     __ mov(base, BASE);
 8183     __ mov(nmax, NMAX);
 8184 
 8185     // Load accumulation coefficients for the upper 16 bits
 8186     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 8187     __ ld1(vtable, __ T16B, Address(temp0));
 8188 
 8189     // s1 is initialized to the lower 16 bits of adler
 8190     // s2 is initialized to the upper 16 bits of adler
 8191     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 8192     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 8193 
    // The pipelined loop needs at least 16 elements per iteration.
    // It does check this, but it is more efficient to skip straight to the
    // cleanup loop for short inputs.
 8196     __ cmp(len, (u1)16);
 8197     __ br(Assembler::HS, L_nmax);
 8198     __ cbz(len, L_combine);
 8199 
 8200     __ bind(L_simple_by1_loop);
 8201     __ ldrb(temp0, Address(__ post(buff, 1)));
 8202     __ add(s1, s1, temp0);
 8203     __ add(s2, s2, s1);
 8204     __ subs(len, len, 1);
 8205     __ br(Assembler::HI, L_simple_by1_loop);
 8206 
 8207     // s1 = s1 % BASE
 8208     __ subs(temp0, s1, base);
 8209     __ csel(s1, temp0, s1, Assembler::HS);
 8210 
 8211     // s2 = s2 % BASE
 8212     __ lsr(temp0, s2, 16);
 8213     __ lsl(temp1, temp0, 4);
 8214     __ sub(temp1, temp1, temp0);
 8215     __ add(s2, temp1, s2, ext::uxth);
 8216 
 8217     __ subs(temp0, s2, base);
 8218     __ csel(s2, temp0, s2, Assembler::HS);
 8219 
 8220     __ b(L_combine);
 8221 
 8222     __ bind(L_nmax);
 8223     __ subs(len, len, nmax);
 8224     __ sub(count, nmax, 16);
 8225     __ br(Assembler::LO, L_by16);
 8226 
 8227     __ bind(L_nmax_loop);
 8228 
 8229     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 8230                                       vbytes, vs1acc, vs2acc, vtable);
 8231 
 8232     __ subs(count, count, 16);
 8233     __ br(Assembler::HS, L_nmax_loop);
 8234 
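    // The reductions below use 2^16 mod 65521 == 15, so for a 32-bit s:
    //   s mod BASE == (15 * (s >> 16) + (s & 0xffff)) mod BASE
    // computed via lsr/lsl/sub as (high << 4) - high + low. Two folding
    // rounds bring the value below 2 * BASE, after which one conditional
    // subtract (subs/csel) completes the reduction.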
 8235     // s1 = s1 % BASE
 8236     __ lsr(temp0, s1, 16);
 8237     __ lsl(temp1, temp0, 4);
 8238     __ sub(temp1, temp1, temp0);
 8239     __ add(temp1, temp1, s1, ext::uxth);
 8240 
 8241     __ lsr(temp0, temp1, 16);
 8242     __ lsl(s1, temp0, 4);
 8243     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 8245 
 8246     __ subs(temp0, s1, base);
 8247     __ csel(s1, temp0, s1, Assembler::HS);
 8248 
 8249     // s2 = s2 % BASE
 8250     __ lsr(temp0, s2, 16);
 8251     __ lsl(temp1, temp0, 4);
 8252     __ sub(temp1, temp1, temp0);
 8253     __ add(temp1, temp1, s2, ext::uxth);
 8254 
 8255     __ lsr(temp0, temp1, 16);
 8256     __ lsl(s2, temp0, 4);
 8257     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 8259 
 8260     __ subs(temp0, s2, base);
 8261     __ csel(s2, temp0, s2, Assembler::HS);
 8262 
 8263     __ subs(len, len, nmax);
 8264     __ sub(count, nmax, 16);
 8265     __ br(Assembler::HS, L_nmax_loop);
 8266 
 8267     __ bind(L_by16);
 8268     __ adds(len, len, count);
 8269     __ br(Assembler::LO, L_by1);
 8270 
 8271     __ bind(L_by16_loop);
 8272 
 8273     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 8274                                       vbytes, vs1acc, vs2acc, vtable);
 8275 
 8276     __ subs(len, len, 16);
 8277     __ br(Assembler::HS, L_by16_loop);
 8278 
 8279     __ bind(L_by1);
 8280     __ adds(len, len, 15);
 8281     __ br(Assembler::LO, L_do_mod);
 8282 
 8283     __ bind(L_by1_loop);
 8284     __ ldrb(temp0, Address(__ post(buff, 1)));
 8285     __ add(s1, temp0, s1);
 8286     __ add(s2, s2, s1);
 8287     __ subs(len, len, 1);
 8288     __ br(Assembler::HS, L_by1_loop);
 8289 
 8290     __ bind(L_do_mod);
 8291     // s1 = s1 % BASE
 8292     __ lsr(temp0, s1, 16);
 8293     __ lsl(temp1, temp0, 4);
 8294     __ sub(temp1, temp1, temp0);
 8295     __ add(temp1, temp1, s1, ext::uxth);
 8296 
 8297     __ lsr(temp0, temp1, 16);
 8298     __ lsl(s1, temp0, 4);
 8299     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 8301 
 8302     __ subs(temp0, s1, base);
 8303     __ csel(s1, temp0, s1, Assembler::HS);
 8304 
 8305     // s2 = s2 % BASE
 8306     __ lsr(temp0, s2, 16);
 8307     __ lsl(temp1, temp0, 4);
 8308     __ sub(temp1, temp1, temp0);
 8309     __ add(temp1, temp1, s2, ext::uxth);
 8310 
 8311     __ lsr(temp0, temp1, 16);
 8312     __ lsl(s2, temp0, 4);
 8313     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 8315 
 8316     __ subs(temp0, s2, base);
 8317     __ csel(s2, temp0, s2, Assembler::HS);
 8318 
 8319     // Combine lower bits and higher bits
 8320     __ bind(L_combine);
 8321     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 8322 
 8323     __ ret(lr);
 8324 
 8325     // record the stub entry and end
 8326     store_archive_data(stub_id, start, __ pc());
 8327 
 8328     return start;
 8329   }
 8330 
 8331   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 8332           Register temp0, Register temp1, FloatRegister vbytes,
 8333           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 8334     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 8335     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 8336     // In non-vectorized code, we update s1 and s2 as:
 8337     //   s1 <- s1 + b1
 8338     //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
 8341     //   ...
 8342     //   s1 <- s1 + b16
 8343     //   s2 <- s2 + s1
 8344     // Putting above assignments together, we have:
 8345     //   s1_new = s1 + b1 + b2 + ... + b16
 8346     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 8347     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 8348     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
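    // An equivalent scalar sketch of this 16-byte step (illustrative only):
    //   uint64_t sum = 0, dot = 0;
    //   for (int i = 0; i < 16; i++) {
    //     sum += b[i];              // -> vs1acc
    //     dot += (16 - i) * b[i];   // -> vs2acc
    //   }
    //   s2 += s1 << 4;  s1 += sum;  s2 += dot;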
 8349     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 8350 
 8351     // s2 = s2 + s1 * 16
 8352     __ add(s2, s2, s1, Assembler::LSL, 4);
 8353 
 8354     // vs1acc = b1 + b2 + b3 + ... + b16
 8355     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 8356     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 8357     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 8358     __ uaddlv(vs1acc, __ T16B, vbytes);
 8359     __ uaddlv(vs2acc, __ T8H, vs2acc);
 8360 
 8361     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 8362     __ fmovd(temp0, vs1acc);
 8363     __ fmovd(temp1, vs2acc);
 8364     __ add(s1, s1, temp0);
 8365     __ add(s2, s2, temp1);
 8366   }
 8367 
 8368   /**
 8369    *  Arguments:
 8370    *
 8371    *  Input:
 8372    *    c_rarg0   - x address
 8373    *    c_rarg1   - x length
 8374    *    c_rarg2   - y address
 8375    *    c_rarg3   - y length
 8376    *    c_rarg4   - z address
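   *
   *   z is assumed to provide room for xlen + ylen words, as in
   *   BigInteger.multiplyToLen (a precondition, not checked here)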
 8377    */
 8378   address generate_multiplyToLen() {
 8379     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 8380     int entry_count = StubInfo::entry_count(stub_id);
 8381     assert(entry_count == 1, "sanity check");
 8382     address start = load_archive_data(stub_id);
 8383     if (start != nullptr) {
 8384       return start;
 8385     }
 8386     __ align(CodeEntryAlignment);
 8387     StubCodeMark mark(this, stub_id);
 8388 
 8389     start = __ pc();
 8390     const Register x     = r0;
 8391     const Register xlen  = r1;
 8392     const Register y     = r2;
 8393     const Register ylen  = r3;
 8394     const Register z     = r4;
 8395 
 8396     const Register tmp0  = r5;
 8397     const Register tmp1  = r10;
 8398     const Register tmp2  = r11;
 8399     const Register tmp3  = r12;
 8400     const Register tmp4  = r13;
 8401     const Register tmp5  = r14;
 8402     const Register tmp6  = r15;
 8403     const Register tmp7  = r16;
 8404 
 8405     BLOCK_COMMENT("Entry:");
 8406     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8407     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 8408     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8409     __ ret(lr);
 8410 
 8411     // record the stub entry and end
 8412     store_archive_data(stub_id, start, __ pc());
 8413 
 8414     return start;
 8415   }
 8416 
 8417   address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127 described in the Java code
    // is faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len shows slightly better results overall.
 8421     StubId stub_id = StubId::stubgen_squareToLen_id;
 8422     int entry_count = StubInfo::entry_count(stub_id);
 8423     assert(entry_count == 1, "sanity check");
 8424     address start = load_archive_data(stub_id);
 8425     if (start != nullptr) {
 8426       return start;
 8427     }
 8428     __ align(CodeEntryAlignment);
 8429     StubCodeMark mark(this, stub_id);
 8430     start = __ pc();
 8431 
 8432     const Register x     = r0;
 8433     const Register xlen  = r1;
 8434     const Register z     = r2;
 8435     const Register y     = r4; // == x
 8436     const Register ylen  = r5; // == xlen
 8437 
 8438     const Register tmp0  = r3;
 8439     const Register tmp1  = r10;
 8440     const Register tmp2  = r11;
 8441     const Register tmp3  = r12;
 8442     const Register tmp4  = r13;
 8443     const Register tmp5  = r14;
 8444     const Register tmp6  = r15;
 8445     const Register tmp7  = r16;
 8446 
 8447     RegSet spilled_regs = RegSet::of(y, ylen);
 8448     BLOCK_COMMENT("Entry:");
 8449     __ enter();
 8450     __ push(spilled_regs, sp);
 8451     __ mov(y, x);
 8452     __ mov(ylen, xlen);
 8453     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 8454     __ pop(spilled_regs, sp);
 8455     __ leave();
 8456     __ ret(lr);
 8457 
 8458     // record the stub entry and end
 8459     store_archive_data(stub_id, start, __ pc());
 8460 
 8461     return start;
 8462   }
 8463 
 8464   address generate_mulAdd() {
 8465     StubId stub_id = StubId::stubgen_mulAdd_id;
 8466     int entry_count = StubInfo::entry_count(stub_id);
 8467     assert(entry_count == 1, "sanity check");
 8468     address start = load_archive_data(stub_id);
 8469     if (start != nullptr) {
 8470       return start;
 8471     }
 8472     __ align(CodeEntryAlignment);
 8473     StubCodeMark mark(this, stub_id);
 8474 
 8475     start = __ pc();
 8476 
 8477     const Register out     = r0;
 8478     const Register in      = r1;
 8479     const Register offset  = r2;
 8480     const Register len     = r3;
 8481     const Register k       = r4;
 8482 
 8483     BLOCK_COMMENT("Entry:");
 8484     __ enter();
 8485     __ mul_add(out, in, offset, len, k);
 8486     __ leave();
 8487     __ ret(lr);
 8488 
 8489     // record the stub entry and end
 8490     store_archive_data(stub_id, start, __ pc());
 8491 
 8492     return start;
 8493   }
 8494 
 8495   // Arguments:
 8496   //
 8497   // Input:
 8498   //   c_rarg0   - newArr address
 8499   //   c_rarg1   - oldArr address
 8500   //   c_rarg2   - newIdx
 8501   //   c_rarg3   - shiftCount
 8502   //   c_rarg4   - numIter
 8503   //
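  // A scalar sketch of the per-element computation below (assuming
  // unsigned 32-bit elements; not code from this file):
  //   newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
  //                      | (oldArr[i]     << (32 - shiftCount));
  //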
 8504   address generate_bigIntegerRightShift() {
 8505     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 8506     int entry_count = StubInfo::entry_count(stub_id);
 8507     assert(entry_count == 1, "sanity check");
 8508     address start = load_archive_data(stub_id);
 8509     if (start != nullptr) {
 8510       return start;
 8511     }
 8512     __ align(CodeEntryAlignment);
 8513     StubCodeMark mark(this, stub_id);
 8514     start = __ pc();
 8515 
 8516     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8517 
 8518     Register newArr        = c_rarg0;
 8519     Register oldArr        = c_rarg1;
 8520     Register newIdx        = c_rarg2;
 8521     Register shiftCount    = c_rarg3;
 8522     Register numIter       = c_rarg4;
 8523     Register idx           = numIter;
 8524 
 8525     Register newArrCur     = rscratch1;
 8526     Register shiftRevCount = rscratch2;
 8527     Register oldArrCur     = r13;
 8528     Register oldArrNext    = r14;
 8529 
 8530     FloatRegister oldElem0        = v0;
 8531     FloatRegister oldElem1        = v1;
 8532     FloatRegister newElem         = v2;
 8533     FloatRegister shiftVCount     = v3;
 8534     FloatRegister shiftVRevCount  = v4;
 8535 
 8536     __ cbz(idx, Exit);
 8537 
 8538     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8539 
 8540     // left shift count
 8541     __ movw(shiftRevCount, 32);
 8542     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8543 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
 8545     __ cmp(numIter, (u1)4);
 8546     __ br(Assembler::LT, ShiftThree);
 8547 
 8548     __ dup(shiftVCount,    __ T4S, shiftCount);
 8549     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 8550     __ negr(shiftVCount,   __ T4S, shiftVCount);
 8551 
 8552     __ BIND(ShiftSIMDLoop);
 8553 
 8554     // Calculate the load addresses
 8555     __ sub(idx, idx, 4);
 8556     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8557     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8558     __ add(oldArrCur,  oldArrNext, 4);
 8559 
 8560     // Load 4 words and process
 8561     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 8562     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 8563     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8564     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8565     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8566     __ st1(newElem,   __ T4S,  Address(newArrCur));
 8567 
 8568     __ cmp(idx, (u1)4);
 8569     __ br(Assembler::LT, ShiftTwoLoop);
 8570     __ b(ShiftSIMDLoop);
 8571 
 8572     __ BIND(ShiftTwoLoop);
 8573     __ cbz(idx, Exit);
 8574     __ cmp(idx, (u1)1);
 8575     __ br(Assembler::EQ, ShiftOne);
 8576 
 8577     // Calculate the load addresses
 8578     __ sub(idx, idx, 2);
 8579     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8580     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8581     __ add(oldArrCur,  oldArrNext, 4);
 8582 
 8583     // Load 2 words and process
 8584     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 8585     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 8586     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 8587     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 8588     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 8589     __ st1(newElem,   __ T2S, Address(newArrCur));
 8590     __ b(ShiftTwoLoop);
 8591 
 8592     __ BIND(ShiftThree);
 8593     __ tbz(idx, 1, ShiftOne);
 8594     __ tbz(idx, 0, ShiftTwo);
 8595     __ ldrw(r10,  Address(oldArr, 12));
 8596     __ ldrw(r11,  Address(oldArr, 8));
 8597     __ lsrvw(r10, r10, shiftCount);
 8598     __ lslvw(r11, r11, shiftRevCount);
 8599     __ orrw(r12,  r10, r11);
 8600     __ strw(r12,  Address(newArr, 8));
 8601 
 8602     __ BIND(ShiftTwo);
 8603     __ ldrw(r10,  Address(oldArr, 8));
 8604     __ ldrw(r11,  Address(oldArr, 4));
 8605     __ lsrvw(r10, r10, shiftCount);
 8606     __ lslvw(r11, r11, shiftRevCount);
 8607     __ orrw(r12,  r10, r11);
 8608     __ strw(r12,  Address(newArr, 4));
 8609 
 8610     __ BIND(ShiftOne);
 8611     __ ldrw(r10,  Address(oldArr, 4));
 8612     __ ldrw(r11,  Address(oldArr));
 8613     __ lsrvw(r10, r10, shiftCount);
 8614     __ lslvw(r11, r11, shiftRevCount);
 8615     __ orrw(r12,  r10, r11);
 8616     __ strw(r12,  Address(newArr));
 8617 
 8618     __ BIND(Exit);
 8619     __ ret(lr);
 8620 
 8621     // record the stub entry and end
 8622     store_archive_data(stub_id, start, __ pc());
 8623 
 8624     return start;
 8625   }
 8626 
 8627   // Arguments:
 8628   //
 8629   // Input:
 8630   //   c_rarg0   - newArr address
 8631   //   c_rarg1   - oldArr address
 8632   //   c_rarg2   - newIdx
 8633   //   c_rarg3   - shiftCount
 8634   //   c_rarg4   - numIter
 8635   //
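  // A scalar sketch of the per-element computation below (assuming
  // unsigned 32-bit elements; not code from this file):
  //   newArr[newIdx + i] = (oldArr[i]     << shiftCount)
  //                      | (oldArr[i + 1] >> (32 - shiftCount));
  //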
 8636   address generate_bigIntegerLeftShift() {
 8637     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8638     int entry_count = StubInfo::entry_count(stub_id);
 8639     assert(entry_count == 1, "sanity check");
 8640     address start = load_archive_data(stub_id);
 8641     if (start != nullptr) {
 8642       return start;
 8643     }
 8644     __ align(CodeEntryAlignment);
 8645     StubCodeMark mark(this, stub_id);
 8646     start = __ pc();
 8647 
 8648     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8649 
 8650     Register newArr        = c_rarg0;
 8651     Register oldArr        = c_rarg1;
 8652     Register newIdx        = c_rarg2;
 8653     Register shiftCount    = c_rarg3;
 8654     Register numIter       = c_rarg4;
 8655 
 8656     Register shiftRevCount = rscratch1;
 8657     Register oldArrNext    = rscratch2;
 8658 
 8659     FloatRegister oldElem0        = v0;
 8660     FloatRegister oldElem1        = v1;
 8661     FloatRegister newElem         = v2;
 8662     FloatRegister shiftVCount     = v3;
 8663     FloatRegister shiftVRevCount  = v4;
 8664 
 8665     __ cbz(numIter, Exit);
 8666 
 8667     __ add(oldArrNext, oldArr, 4);
 8668     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8669 
 8670     // right shift count
 8671     __ movw(shiftRevCount, 32);
 8672     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8673 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
 8675     __ cmp(numIter, (u1)4);
 8676     __ br(Assembler::LT, ShiftThree);
 8677 
 8678     __ dup(shiftVCount,     __ T4S, shiftCount);
 8679     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8680     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8681 
 8682     __ BIND(ShiftSIMDLoop);
 8683 
 8684     // load 4 words and process
 8685     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8686     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8687     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8688     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8689     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8690     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8691     __ sub(numIter,   numIter, 4);
 8692 
 8693     __ cmp(numIter, (u1)4);
 8694     __ br(Assembler::LT, ShiftTwoLoop);
 8695     __ b(ShiftSIMDLoop);
 8696 
 8697     __ BIND(ShiftTwoLoop);
 8698     __ cbz(numIter, Exit);
 8699     __ cmp(numIter, (u1)1);
 8700     __ br(Assembler::EQ, ShiftOne);
 8701 
 8702     // load 2 words and process
 8703     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8704     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8705     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8706     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8707     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8708     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8709     __ sub(numIter,   numIter, 2);
 8710     __ b(ShiftTwoLoop);
 8711 
 8712     __ BIND(ShiftThree);
 8713     __ ldrw(r10,  __ post(oldArr, 4));
 8714     __ ldrw(r11,  __ post(oldArrNext, 4));
 8715     __ lslvw(r10, r10, shiftCount);
 8716     __ lsrvw(r11, r11, shiftRevCount);
 8717     __ orrw(r12,  r10, r11);
 8718     __ strw(r12,  __ post(newArr, 4));
 8719     __ tbz(numIter, 1, Exit);
 8720     __ tbz(numIter, 0, ShiftOne);
 8721 
 8722     __ BIND(ShiftTwo);
 8723     __ ldrw(r10,  __ post(oldArr, 4));
 8724     __ ldrw(r11,  __ post(oldArrNext, 4));
 8725     __ lslvw(r10, r10, shiftCount);
 8726     __ lsrvw(r11, r11, shiftRevCount);
 8727     __ orrw(r12,  r10, r11);
 8728     __ strw(r12,  __ post(newArr, 4));
 8729 
 8730     __ BIND(ShiftOne);
 8731     __ ldrw(r10,  Address(oldArr));
 8732     __ ldrw(r11,  Address(oldArrNext));
 8733     __ lslvw(r10, r10, shiftCount);
 8734     __ lsrvw(r11, r11, shiftRevCount);
 8735     __ orrw(r12,  r10, r11);
 8736     __ strw(r12,  Address(newArr));
 8737 
 8738     __ BIND(Exit);
 8739     __ ret(lr);
 8740 
 8741     // record the stub entry and end
 8742     store_archive_data(stub_id, start, __ pc());
 8743 
 8744     return start;
 8745   }
 8746 
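  // Inputs:
  //   ary1 (r1)   - byte array address
  //   len  (r2)   - array length
  //   result (r0) - must contain a copy of len on entry
  //
  // Output:
  //   result (r0) - the number of leading bytes guaranteed to be
  //                 positive (sign bit clear); equal to len when no
  //                 negative byte was seen
  // (summary inferred from the code below)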
 8747   address generate_count_positives(address &count_positives_long) {
 8748     StubId stub_id = StubId::stubgen_count_positives_id;
 8749     GrowableArray<address> entries;
 8750     int entry_count = StubInfo::entry_count(stub_id);
 8751     // We have an extra entry for count_positives_long.
 8752     assert(entry_count == 2, "sanity check");
 8753     address start = load_archive_data(stub_id, &entries);
 8754     if (start != nullptr) {
 8755       assert(entries.length() == 1,
 8756              "unexpected extra entry count %d", entries.length());
 8757       count_positives_long = entries.at(0);
 8758       return start;
 8759     }
 8760     const u1 large_loop_size = 64;
 8761     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 8762     int dcache_line = VM_Version::dcache_line_size();
 8763 
 8764     Register ary1 = r1, len = r2, result = r0;
 8765 
 8766     __ align(CodeEntryAlignment);
 8767     StubCodeMark mark(this, stub_id);
 8768 
 8769     address entry = __ pc();
 8770 
 8771     __ enter();
 8772     // precondition: a copy of len is already in result
 8773     // __ mov(result, len);
 8774 
 8775   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8776         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8777 
 8778   __ cmp(len, (u1)15);
 8779   __ br(Assembler::GT, LEN_OVER_15);
  // Execution only falls into this code when the pointer is near the end of
  // a memory page and we have to avoid reading past it
 8782   __ add(ary1, ary1, len);
 8783   __ subs(len, len, 8);
 8784   __ br(Assembler::GT, LEN_OVER_8);
 8785   __ ldr(rscratch2, Address(ary1, -8));
 8786   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8787   __ lsrv(rscratch2, rscratch2, rscratch1);
 8788   __ tst(rscratch2, UPPER_BIT_MASK);
 8789   __ csel(result, zr, result, Assembler::NE);
 8790   __ leave();
 8791   __ ret(lr);
 8792   __ bind(LEN_OVER_8);
 8793   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dependency, so the sub can execute while the load completes
 8795   __ tst(rscratch2, UPPER_BIT_MASK);
 8796   __ br(Assembler::NE, RET_NO_POP);
 8797   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8798   __ lsrv(rscratch1, rscratch1, rscratch2);
 8799   __ tst(rscratch1, UPPER_BIT_MASK);
 8800   __ bind(RET_NO_POP);
 8801   __ csel(result, zr, result, Assembler::NE);
 8802   __ leave();
 8803   __ ret(lr);
 8804 
 8805   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8806   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8807 
 8808   count_positives_long = __ pc(); // 2nd entry point
 8809   entries.append(count_positives_long);
 8810 
 8811   __ enter();
 8812 
 8813   __ bind(LEN_OVER_15);
 8814     __ push(spilled_regs, sp);
 8815     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8816     __ cbz(rscratch2, ALIGNED);
 8817     __ ldp(tmp6, tmp1, Address(ary1));
 8818     __ mov(tmp5, 16);
 8819     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8820     __ add(ary1, ary1, rscratch1);
 8821     __ orr(tmp6, tmp6, tmp1);
 8822     __ tst(tmp6, UPPER_BIT_MASK);
 8823     __ br(Assembler::NE, RET_ADJUST);
 8824     __ sub(len, len, rscratch1);
 8825 
 8826   __ bind(ALIGNED);
 8827     __ cmp(len, large_loop_size);
 8828     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load in the pre-loop as an early return to handle the
    // case where an initially aligned large array has negative values in its
    // first bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
    // worst case, which is slower. Cases with negative bytes further ahead
    // are barely affected; in fact they get faster thanks to the early loads
    // and the fewer instructions and branches in LARGE_LOOP.
 8835     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8836     __ sub(len, len, 16);
 8837     __ orr(tmp6, tmp6, tmp1);
 8838     __ tst(tmp6, UPPER_BIT_MASK);
 8839     __ br(Assembler::NE, RET_ADJUST_16);
 8840     __ cmp(len, large_loop_size);
 8841     __ br(Assembler::LT, CHECK_16);
 8842 
 8843     if (SoftwarePrefetchHintDistance >= 0
 8844         && SoftwarePrefetchHintDistance >= dcache_line) {
 8845       // initial prefetch
 8846       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8847     }
 8848   __ bind(LARGE_LOOP);
 8849     if (SoftwarePrefetchHintDistance >= 0) {
 8850       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8851     }
    // Issue the load instructions first, since that can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
    // (one per ldp), generate 7 * orr(...) plus a single tst(...)/br(...)
    // pair, which saves 3 instructions and has fewer branches. The trade-off
    // is that early return is disabled, so all 64 bytes are loaded and
    // checked every time.
 8857     __ ldp(tmp2, tmp3, Address(ary1));
 8858     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8859     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8860     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8861     __ add(ary1, ary1, large_loop_size);
 8862     __ sub(len, len, large_loop_size);
 8863     __ orr(tmp2, tmp2, tmp3);
 8864     __ orr(tmp4, tmp4, tmp5);
 8865     __ orr(rscratch1, rscratch1, rscratch2);
 8866     __ orr(tmp6, tmp6, tmp1);
 8867     __ orr(tmp2, tmp2, tmp4);
 8868     __ orr(rscratch1, rscratch1, tmp6);
 8869     __ orr(tmp2, tmp2, rscratch1);
 8870     __ tst(tmp2, UPPER_BIT_MASK);
 8871     __ br(Assembler::NE, RET_ADJUST_LONG);
 8872     __ cmp(len, large_loop_size);
 8873     __ br(Assembler::GE, LARGE_LOOP);
 8874 
 8875   __ bind(CHECK_16); // small 16-byte load pre-loop
 8876     __ cmp(len, (u1)16);
 8877     __ br(Assembler::LT, POST_LOOP16);
 8878 
 8879   __ bind(LOOP16); // small 16-byte load loop
 8880     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8881     __ sub(len, len, 16);
 8882     __ orr(tmp2, tmp2, tmp3);
 8883     __ tst(tmp2, UPPER_BIT_MASK);
 8884     __ br(Assembler::NE, RET_ADJUST_16);
 8885     __ cmp(len, (u1)16);
 8886     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8887 
 8888   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8889     __ cmp(len, (u1)8);
 8890     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8891     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8892     __ tst(tmp3, UPPER_BIT_MASK);
 8893     __ br(Assembler::NE, RET_ADJUST);
 8894     __ sub(len, len, 8);
 8895 
 8896   __ bind(POST_LOOP16_LOAD_TAIL);
 8897     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8898     __ ldr(tmp1, Address(ary1));
 8899     __ mov(tmp2, 64);
 8900     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8901     __ lslv(tmp1, tmp1, tmp4);
 8902     __ tst(tmp1, UPPER_BIT_MASK);
 8903     __ br(Assembler::NE, RET_ADJUST);
 8904     // Fallthrough
 8905 
 8906   __ bind(RET_LEN);
 8907     __ pop(spilled_regs, sp);
 8908     __ leave();
 8909     __ ret(lr);
 8910 
    // The difference result - len is the count of bytes guaranteed to be
    // positive
 8913 
 8914   __ bind(RET_ADJUST_LONG);
 8915     __ add(len, len, (u1)(large_loop_size - 16));
 8916   __ bind(RET_ADJUST_16);
 8917     __ add(len, len, 16);
 8918   __ bind(RET_ADJUST);
 8919     __ pop(spilled_regs, sp);
 8920     __ leave();
 8921     __ sub(result, result, len);
 8922     __ ret(lr);
 8923 
 8924     // record the stub entry and end plus the extra entry
 8925     store_archive_data(stub_id, entry, __ pc(), &entries);
 8926 
 8927     return entry;
 8928   }
 8929 
 8930   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8931         bool usePrefetch, Label &NOT_EQUAL) {
 8932     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8933         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8934         tmp7 = r12, tmp8 = r13;
 8935     Label LOOP;
 8936 
 8937     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8938     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8939     __ bind(LOOP);
 8940     if (usePrefetch) {
 8941       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8942       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8943     }
 8944     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8945     __ eor(tmp1, tmp1, tmp2);
 8946     __ eor(tmp3, tmp3, tmp4);
 8947     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8948     __ orr(tmp1, tmp1, tmp3);
 8949     __ cbnz(tmp1, NOT_EQUAL);
 8950     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8951     __ eor(tmp5, tmp5, tmp6);
 8952     __ eor(tmp7, tmp7, tmp8);
 8953     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8954     __ orr(tmp5, tmp5, tmp7);
 8955     __ cbnz(tmp5, NOT_EQUAL);
 8956     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8957     __ eor(tmp1, tmp1, tmp2);
 8958     __ eor(tmp3, tmp3, tmp4);
 8959     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8960     __ orr(tmp1, tmp1, tmp3);
 8961     __ cbnz(tmp1, NOT_EQUAL);
 8962     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8963     __ eor(tmp5, tmp5, tmp6);
 8964     __ sub(cnt1, cnt1, 8 * wordSize);
 8965     __ eor(tmp7, tmp7, tmp8);
 8966     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
 8969     __ subs(tmp6, cnt1, loopThreshold);
 8970     __ orr(tmp5, tmp5, tmp7);
 8971     __ cbnz(tmp5, NOT_EQUAL);
 8972     __ br(__ GE, LOOP);
 8973     // post-loop
 8974     __ eor(tmp1, tmp1, tmp2);
 8975     __ eor(tmp3, tmp3, tmp4);
 8976     __ orr(tmp1, tmp1, tmp3);
 8977     __ sub(cnt1, cnt1, 2 * wordSize);
 8978     __ cbnz(tmp1, NOT_EQUAL);
 8979   }
 8980 
 8981   void generate_large_array_equals_loop_simd(int loopThreshold,
 8982         bool usePrefetch, Label &NOT_EQUAL) {
 8983     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8984         tmp2 = rscratch2;
 8985     Label LOOP;
 8986 
 8987     __ bind(LOOP);
 8988     if (usePrefetch) {
 8989       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8990       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8991     }
 8992     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8993     __ sub(cnt1, cnt1, 8 * wordSize);
 8994     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8995     __ subs(tmp1, cnt1, loopThreshold);
 8996     __ eor(v0, __ T16B, v0, v4);
 8997     __ eor(v1, __ T16B, v1, v5);
 8998     __ eor(v2, __ T16B, v2, v6);
 8999     __ eor(v3, __ T16B, v3, v7);
 9000     __ orr(v0, __ T16B, v0, v1);
 9001     __ orr(v1, __ T16B, v2, v3);
 9002     __ orr(v0, __ T16B, v0, v1);
 9003     __ umov(tmp1, v0, __ D, 0);
 9004     __ umov(tmp2, v0, __ D, 1);
 9005     __ orr(tmp1, tmp1, tmp2);
 9006     __ cbnz(tmp1, NOT_EQUAL);
 9007     __ br(__ GE, LOOP);
 9008   }
 9009 
 9010   // a1 = r1 - array1 address
 9011   // a2 = r2 - array2 address
 9012   // result = r0 - return value. Already contains "false"
 9013   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 9014   // r3-r5 are reserved temporary registers
 9015   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 9016   address generate_large_array_equals() {
 9017     StubId stub_id = StubId::stubgen_large_array_equals_id;
 9018     int entry_count = StubInfo::entry_count(stub_id);
 9019     assert(entry_count == 1, "sanity check");
 9020     address start = load_archive_data(stub_id);
 9021     if (start != nullptr) {
 9022       return start;
 9023     }
 9024     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 9025         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 9026         tmp7 = r12, tmp8 = r13;
 9027     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 9028         SMALL_LOOP, POST_LOOP;
 9029     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // threshold chosen so that at least 32 prefetched bytes are actually used
 9031     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 9032     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 9033     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 9034     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 9035         tmp5, tmp6, tmp7, tmp8);
 9036 
 9037     __ align(CodeEntryAlignment);
 9038 
 9039     StubCodeMark mark(this, stub_id);
 9040 
 9041     address entry = __ pc();
 9042     __ enter();
 9043     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 9044     // also advance pointers to use post-increment instead of pre-increment
 9045     __ add(a1, a1, wordSize);
 9046     __ add(a2, a2, wordSize);
 9047     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD and non-SIMD) use relatively large load
      // instructions (ld1/ldp), which incur a heavy penalty (up to 2x
      // execution time) on some CPUs when the address is not at least
      // 16-byte aligned. Arrays are currently 8-byte aligned, so we can do an
      // additional 8-byte load if needed to make at least the first address
      // 16-byte aligned.
 9053       Label ALIGNED16;
 9054       __ tbz(a1, 3, ALIGNED16);
 9055       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 9056       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 9057       __ sub(cnt1, cnt1, wordSize);
 9058       __ eor(tmp1, tmp1, tmp2);
 9059       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 9060       __ bind(ALIGNED16);
 9061     }
 9062     if (UseSIMDForArrayEquals) {
 9063       if (SoftwarePrefetchHintDistance >= 0) {
 9064         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 9065         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 9066         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 9067             /* prfm = */ true, NOT_EQUAL);
 9068         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 9069         __ br(__ LT, TAIL);
 9070       }
 9071       __ bind(NO_PREFETCH_LARGE_LOOP);
 9072       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 9073           /* prfm = */ false, NOT_EQUAL);
 9074     } else {
 9075       __ push(spilled_regs, sp);
 9076       if (SoftwarePrefetchHintDistance >= 0) {
 9077         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 9078         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 9079         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 9080             /* prfm = */ true, NOT_EQUAL);
 9081         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 9082         __ br(__ LT, TAIL);
 9083       }
 9084       __ bind(NO_PREFETCH_LARGE_LOOP);
 9085       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 9086           /* prfm = */ false, NOT_EQUAL);
 9087     }
 9088     __ bind(TAIL);
 9089       __ cbz(cnt1, EQUAL);
 9090       __ subs(cnt1, cnt1, wordSize);
 9091       __ br(__ LE, POST_LOOP);
 9092     __ bind(SMALL_LOOP);
 9093       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 9094       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 9095       __ subs(cnt1, cnt1, wordSize);
 9096       __ eor(tmp1, tmp1, tmp2);
 9097       __ cbnz(tmp1, NOT_EQUAL);
 9098       __ br(__ GT, SMALL_LOOP);
 9099     __ bind(POST_LOOP);
 9100       __ ldr(tmp1, Address(a1, cnt1));
 9101       __ ldr(tmp2, Address(a2, cnt1));
 9102       __ eor(tmp1, tmp1, tmp2);
 9103       __ cbnz(tmp1, NOT_EQUAL);
 9104     __ bind(EQUAL);
 9105       __ mov(result, true);
 9106     __ bind(NOT_EQUAL);
 9107       if (!UseSIMDForArrayEquals) {
 9108         __ pop(spilled_regs, sp);
 9109       }
 9110     __ bind(NOT_EQUAL_NO_POP);
 9111     __ leave();
 9112     __ ret(lr);
 9113 
 9114     // record the stub entry and end
 9115     store_archive_data(stub_id, entry, __ pc());
 9116 
 9117     return entry;
 9118   }
 9119 
 9120   // result = r0 - return value. Contains initial hashcode value on entry.
 9121   // ary = r1 - array address
 9122   // cnt = r2 - elements count
 9123   // Clobbers: v0-v13, rscratch1, rscratch2
 9124   address generate_large_arrays_hashcode(BasicType eltype) {
 9125     StubId stub_id;
 9126     switch (eltype) {
 9127     case T_BOOLEAN:
 9128       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 9129       break;
 9130     case T_BYTE:
 9131       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 9132       break;
 9133     case T_CHAR:
 9134       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 9135       break;
 9136     case T_SHORT:
 9137       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 9138       break;
 9139     case T_INT:
 9140       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 9141       break;
 9142     default:
 9143       stub_id = StubId::NO_STUBID;
 9144       ShouldNotReachHere();
 9145     };
 9146     int entry_count = StubInfo::entry_count(stub_id);
 9147     assert(entry_count == 1, "sanity check");
 9148     address start = load_archive_data(stub_id);
 9149     if (start != nullptr) {
 9150       return start;
 9151     }
 9152     const Register result = r0, ary = r1, cnt = r2;
 9153     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 9154     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 9155     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 9156     const FloatRegister vpowm = v13;
 9157 
 9158     ARRAYS_HASHCODE_REGISTERS;
 9159 
 9160     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 9161 
 9162     unsigned int vf; // vectorization factor
 9163     bool multiply_by_halves;
 9164     Assembler::SIMD_Arrangement load_arrangement;
 9165     switch (eltype) {
 9166     case T_BOOLEAN:
 9167     case T_BYTE:
 9168       load_arrangement = Assembler::T8B;
 9169       multiply_by_halves = true;
 9170       vf = 8;
 9171       break;
 9172     case T_CHAR:
 9173     case T_SHORT:
 9174       load_arrangement = Assembler::T8H;
 9175       multiply_by_halves = true;
 9176       vf = 8;
 9177       break;
 9178     case T_INT:
 9179       load_arrangement = Assembler::T4S;
 9180       multiply_by_halves = false;
 9181       vf = 4;
 9182       break;
 9183     default:
 9184       ShouldNotReachHere();
 9185     }
 9186 
 9187     // Unroll factor
 9188     const unsigned uf = 4;
 9189 
 9190     // Effective vectorization factor
 9191     const unsigned evf = vf * uf;
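
    // The stub computes the usual polynomial hash
    //   h = c[0]*31^(n-1) + c[1]*31^(n-2) + ... + c[n-1]
    // A sketch of the scheme, inferred from the code below: the large loop
    // keeps four interleaved accumulators vmul3..vmul0, multiplying them by
    // powers of 31 (vpowm) before adding freshly loaded elements, and the
    // epilogue recombines them with factors of 31^vf.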
 9192 
 9193     __ align(CodeEntryAlignment);
 9194 
 9195     StubCodeMark mark(this, stub_id);
 9196 
 9197     address entry = __ pc();
 9198     __ enter();
 9199 
    // Put the 0th..3rd powers of 31 together into a single SIMD register. The
    // register is used in the SMALL and LARGE loops' epilogues; the
    // initialization is hoisted here because the register's value does not
    // change throughout either loop.
 9203     __ movw(rscratch1, intpow(31U, 3));
 9204     __ mov(vpow, Assembler::S, 0, rscratch1);
 9205     __ movw(rscratch1, intpow(31U, 2));
 9206     __ mov(vpow, Assembler::S, 1, rscratch1);
 9207     __ movw(rscratch1, intpow(31U, 1));
 9208     __ mov(vpow, Assembler::S, 2, rscratch1);
 9209     __ movw(rscratch1, intpow(31U, 0));
 9210     __ mov(vpow, Assembler::S, 3, rscratch1);
 9211 
 9212     __ mov(vmul0, Assembler::T16B, 0);
 9213     __ mov(vmul0, Assembler::S, 3, result);
 9214 
 9215     __ andr(rscratch2, cnt, (uf - 1) * vf);
 9216     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 9217 
 9218     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 9219     __ mov(vpowm, Assembler::S, 0, rscratch1);
 9220 
 9221     // SMALL LOOP
 9222     __ bind(SMALL_LOOP);
 9223 
 9224     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 9225     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 9226     __ subsw(rscratch2, rscratch2, vf);
 9227 
 9228     if (load_arrangement == Assembler::T8B) {
 9229       // Extend 8B to 8H to be able to use vector multiply
 9230       // instructions
 9231       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 9232       if (is_signed_subword_type(eltype)) {
 9233         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9234       } else {
 9235         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9236       }
 9237     }
 9238 
 9239     switch (load_arrangement) {
 9240     case Assembler::T4S:
 9241       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 9242       break;
 9243     case Assembler::T8B:
 9244     case Assembler::T8H:
 9245       assert(is_subword_type(eltype), "subword type expected");
 9246       if (is_signed_subword_type(eltype)) {
 9247         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9248       } else {
 9249         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9250       }
 9251       break;
 9252     default:
 9253       __ should_not_reach_here();
 9254     }
 9255 
 9256     // Process the upper half of a vector
 9257     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 9258       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 9259       if (is_signed_subword_type(eltype)) {
 9260         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9261       } else {
 9262         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9263       }
 9264     }
 9265 
 9266     __ br(Assembler::HI, SMALL_LOOP);
 9267 
    // SMALL LOOP'S EPILOGUE
 9269     __ lsr(rscratch2, cnt, exact_log2(evf));
 9270     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 9271 
 9272     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 9273     __ addv(vmul0, Assembler::T4S, vmul0);
 9274     __ umov(result, vmul0, Assembler::S, 0);
 9275 
 9276     // TAIL
 9277     __ bind(TAIL);
 9278 
    // The andr computes cnt % vf. The subtract, shifted left by 3 (each
    // load + madd pair is two instructions, i.e. 8 bytes of code), moves the
    // branch target past vf - 1 - (cnt % vf) load + madd pairs, so only the
    // last cnt % vf pairs are executed.
 9281     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 9282     __ andr(rscratch2, cnt, vf - 1);
 9283     __ bind(TAIL_SHORTCUT);
 9284     __ adr(rscratch1, BR_BASE);
    // For Cortex-A53 the shift is 4 because 2 nops are generated, making each pair 4 instructions.
 9286     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 9287     __ movw(rscratch2, 0x1f);
 9288     __ br(rscratch1);
 9289 
 9290     for (size_t i = 0; i < vf - 1; ++i) {
 9291       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 9292                                    eltype);
 9293       __ maddw(result, result, rscratch2, rscratch1);
 9294       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 9295       // Generate 2nd nop to have 4 instructions per iteration.
 9296       if (VM_Version::supports_a53mac()) {
 9297         __ nop();
 9298       }
 9299     }
 9300     __ bind(BR_BASE);
 9301 
 9302     __ leave();
 9303     __ ret(lr);
 9304 
 9305     // LARGE LOOP
 9306     __ bind(LARGE_LOOP_PREHEADER);
 9307 
 9308     __ lsr(rscratch2, cnt, exact_log2(evf));
 9309 
 9310     if (multiply_by_halves) {
 9311       // 31^4 - multiplier between lower and upper parts of a register
 9312       __ movw(rscratch1, intpow(31U, vf / 2));
 9313       __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - the remainder of the per-iteration multiplier, 28 = 32 - 4
 9315       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 9316       __ mov(vpowm, Assembler::S, 0, rscratch1);
 9317     } else {
 9318       // 31^16
 9319       __ movw(rscratch1, intpow(31U, evf));
 9320       __ mov(vpowm, Assembler::S, 0, rscratch1);
 9321     }
 9322 
 9323     __ mov(vmul3, Assembler::T16B, 0);
 9324     __ mov(vmul2, Assembler::T16B, 0);
 9325     __ mov(vmul1, Assembler::T16B, 0);
 9326 
 9327     __ bind(LARGE_LOOP);
 9328 
 9329     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 9330     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 9331     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 9332     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 9333 
 9334     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 9335            Address(__ post(ary, evf * type2aelembytes(eltype))));
 9336 
 9337     if (load_arrangement == Assembler::T8B) {
 9338       // Extend 8B to 8H to be able to use vector multiply
 9339       // instructions
 9340       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 9341       if (is_signed_subword_type(eltype)) {
 9342         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 9343         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 9344         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 9345         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9346       } else {
 9347         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 9348         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 9349         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 9350         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9351       }
 9352     }
 9353 
 9354     switch (load_arrangement) {
 9355     case Assembler::T4S:
 9356       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 9357       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 9358       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 9359       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 9360       break;
 9361     case Assembler::T8B:
 9362     case Assembler::T8H:
 9363       assert(is_subword_type(eltype), "subword type expected");
 9364       if (is_signed_subword_type(eltype)) {
 9365         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 9366         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 9367         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 9368         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9369       } else {
 9370         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 9371         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 9372         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 9373         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9374       }
 9375       break;
 9376     default:
 9377       __ should_not_reach_here();
 9378     }
 9379 
 9380     // Process the upper half of a vector
 9381     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 9382       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 9383       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 9384       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 9385       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 9386       if (is_signed_subword_type(eltype)) {
 9387         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 9388         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 9389         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 9390         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9391       } else {
 9392         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 9393         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 9394         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 9395         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9396       }
 9397     }
 9398 
 9399     __ subsw(rscratch2, rscratch2, 1);
 9400     __ br(Assembler::HI, LARGE_LOOP);
 9401 
 9402     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 9403     __ addv(vmul3, Assembler::T4S, vmul3);
 9404     __ umov(result, vmul3, Assembler::S, 0);
 9405 
 9406     __ mov(rscratch2, intpow(31U, vf));
 9407 
 9408     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 9409     __ addv(vmul2, Assembler::T4S, vmul2);
 9410     __ umov(rscratch1, vmul2, Assembler::S, 0);
 9411     __ maddw(result, result, rscratch2, rscratch1);
 9412 
 9413     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 9414     __ addv(vmul1, Assembler::T4S, vmul1);
 9415     __ umov(rscratch1, vmul1, Assembler::S, 0);
 9416     __ maddw(result, result, rscratch2, rscratch1);
 9417 
 9418     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 9419     __ addv(vmul0, Assembler::T4S, vmul0);
 9420     __ umov(rscratch1, vmul0, Assembler::S, 0);
 9421     __ maddw(result, result, rscratch2, rscratch1);
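    // result == ((s3 * 31^vf + s2) * 31^vf + s1) * 31^vf + s0, where sN is
    // the horizontal sum of accumulator vmulN after its final multiply by
    // the power vector vpow.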
 9422 
 9423     __ andr(rscratch2, cnt, vf - 1);
 9424     __ cbnz(rscratch2, TAIL_SHORTCUT);
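    // cnt & (vf - 1) leftover elements, if any, are handled by the scalar
    // tail at TAIL_SHORTCUT.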
 9425 
 9426     __ leave();
 9427     __ ret(lr);
 9428 
 9429     // record the stub entry and end
 9430     store_archive_data(stub_id, entry, __ pc());
 9431 
 9432     return entry;
 9433   }
 9434 
 9435   address generate_dsin_dcos(bool isCos) {
 9436     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 9437     int entry_count = StubInfo::entry_count(stub_id);
 9438     assert(entry_count == 1, "sanity check");
 9439     address start = load_archive_data(stub_id);
 9440     if (start != nullptr) {
 9441       return start;
 9442     }
 9443     __ align(CodeEntryAlignment);
 9444     StubCodeMark mark(this, stub_id);
 9445     start = __ pc();
 9446     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 9447         (address)StubRoutines::aarch64::_two_over_pi,
 9448         (address)StubRoutines::aarch64::_pio2,
 9449         (address)StubRoutines::aarch64::_dsin_coef,
 9450         (address)StubRoutines::aarch64::_dcos_coef);
 9451 
 9452     // record the stub entry and end
 9453     store_archive_data(stub_id, start, __ pc());
 9454 
 9455     return start;
 9456   }
 9457 
  // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
 9459   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 9460       Label &DIFF2) {
 9461     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 9462     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 9463 
 9464     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 9465     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 9466     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 9467     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 9468 
 9469     __ fmovd(tmpL, vtmp3);
 9470     __ eor(rscratch2, tmp3, tmpL);
 9471     __ cbnz(rscratch2, DIFF2);
 9472 
 9473     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9474     __ umov(tmpL, vtmp3, __ D, 1);
 9475     __ eor(rscratch2, tmpU, tmpL);
 9476     __ cbnz(rscratch2, DIFF1);
 9477 
 9478     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 9479     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 9480     __ fmovd(tmpL, vtmp);
 9481     __ eor(rscratch2, tmp3, tmpL);
 9482     __ cbnz(rscratch2, DIFF2);
 9483 
 9484     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9485     __ umov(tmpL, vtmp, __ D, 1);
 9486     __ eor(rscratch2, tmpU, tmpL);
 9487     __ cbnz(rscratch2, DIFF1);
 9488   }
 9489 
 9490   // r0  = result
 9491   // r1  = str1
 9492   // r2  = cnt1
 9493   // r3  = str2
 9494   // r4  = cnt2
 9495   // r10 = tmp1
 9496   // r11 = tmp2
 9497   address generate_compare_long_string_different_encoding(bool isLU) {
 9498     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 9499     int entry_count = StubInfo::entry_count(stub_id);
 9500     assert(entry_count == 1, "sanity check");
 9501     address start = load_archive_data(stub_id);
 9502     if (start != nullptr) {
 9503       return start;
 9504     }
 9505     __ align(CodeEntryAlignment);
 9506     StubCodeMark mark(this, stub_id);
 9507     address entry = __ pc();
 9508     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 9509         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 9510         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 9511     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9512         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 9513     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 9514     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 9515 
 9516     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 9517 
 9518     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 9521     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 9522     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 9523     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 9524     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
 9526     __ eor(rscratch2, tmp1, tmp2);
 9527     __ mov(rscratch1, tmp2);
 9528     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 9529     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 9530              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 9531     __ push(spilled_regs, sp);
 9532     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 9533     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 9534 
 9535     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9536 
 9537     if (SoftwarePrefetchHintDistance >= 0) {
 9538       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 9539       __ br(__ LT, NO_PREFETCH);
 9540       __ bind(LARGE_LOOP_PREFETCH);
 9541         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 9542         __ mov(tmp4, 2);
 9543         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 9544         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 9545           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9546           __ subs(tmp4, tmp4, 1);
 9547           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 9548           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 9549           __ mov(tmp4, 2);
 9550         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 9551           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9552           __ subs(tmp4, tmp4, 1);
 9553           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 9554           __ sub(cnt2, cnt2, 64);
 9555           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 9556           __ br(__ GE, LARGE_LOOP_PREFETCH);
 9557     }
 9558     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 9559     __ bind(NO_PREFETCH);
 9560     __ subs(cnt2, cnt2, 16);
 9561     __ br(__ LT, TAIL);
 9562     __ align(OptoLoopAlignment);
 9563     __ bind(SMALL_LOOP); // smaller loop
 9564       __ subs(cnt2, cnt2, 16);
 9565       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9566       __ br(__ GE, SMALL_LOOP);
 9567       __ cmn(cnt2, (u1)16);
 9568       __ br(__ EQ, LOAD_LAST);
 9569     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 9570       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 9571       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 9572       __ ldr(tmp3, Address(cnt1, -8));
 9573       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 9574       __ b(LOAD_LAST);
 9575     __ bind(DIFF2);
 9576       __ mov(tmpU, tmp3);
 9577     __ bind(DIFF1);
 9578       __ pop(spilled_regs, sp);
 9579       __ b(CALCULATE_DIFFERENCE);
 9580     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
      // compare_string_16_x_LU. No need to load them again.
 9583       __ mov(tmpU, tmp3);
 9584       __ pop(spilled_regs, sp);
 9585 
 9586       // tmp2 points to the address of the last 4 Latin1 characters right now
 9587       __ ldrs(vtmp, Address(tmp2));
 9588       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 9589       __ fmovd(tmpL, vtmp);
 9590 
 9591       __ eor(rscratch2, tmpU, tmpL);
 9592       __ cbz(rscratch2, DONE);
 9593 
 9594     // Find the first different characters in the longwords and
 9595     // compute their difference.
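    // The trick: rscratch2 holds the XOR of the two longwords; rev + clz
    // yields the bit index of the first differing byte counted from the
    // low end, and masking with -16 rounds it down to a 16-bit char
    // boundary, giving the shift that brings the first differing chars
    // into the low 16 bits of tmp1 and rscratch1.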
 9596     __ bind(CALCULATE_DIFFERENCE);
 9597       __ rev(rscratch2, rscratch2);
 9598       __ clz(rscratch2, rscratch2);
 9599       __ andr(rscratch2, rscratch2, -16);
 9600       __ lsrv(tmp1, tmp1, rscratch2);
 9601       __ uxthw(tmp1, tmp1);
 9602       __ lsrv(rscratch1, rscratch1, rscratch2);
 9603       __ uxthw(rscratch1, rscratch1);
 9604       __ subw(result, tmp1, rscratch1);
 9605     __ bind(DONE);
 9606       __ ret(lr);
 9607 
 9608       // record the stub entry and end
 9609       store_archive_data(stub_id, entry, __ pc());
 9610 
 9611       return entry;
 9612   }
 9613 
 9614   // r0 = input (float16)
 9615   // v0 = result (float)
 9616   // v1 = temporary float register
 9617   address generate_float16ToFloat() {
 9618     StubId stub_id = StubId::stubgen_hf2f_id;
 9619     int entry_count = StubInfo::entry_count(stub_id);
 9620     assert(entry_count == 1, "sanity check");
 9621     address start = load_archive_data(stub_id);
 9622     if (start != nullptr) {
 9623       return start;
 9624     }
 9625     __ align(CodeEntryAlignment);
 9626     StubCodeMark mark(this, stub_id);
 9627     address entry = __ pc();
 9628     BLOCK_COMMENT("Entry:");
 9629     __ flt16_to_flt(v0, r0, v1);
 9630     __ ret(lr);
 9631 
 9632     // record the stub entry and end
 9633     store_archive_data(stub_id, entry, __ pc());
 9634 
 9635     return entry;
 9636   }
 9637 
 9638   // v0 = input (float)
 9639   // r0 = result (float16)
 9640   // v1 = temporary float register
 9641   address generate_floatToFloat16() {
 9642     StubId stub_id = StubId::stubgen_f2hf_id;
 9643     int entry_count = StubInfo::entry_count(stub_id);
 9644     assert(entry_count == 1, "sanity check");
 9645     address start = load_archive_data(stub_id);
 9646     if (start != nullptr) {
 9647       return start;
 9648     }
 9649     __ align(CodeEntryAlignment);
 9650     StubCodeMark mark(this, stub_id);
 9651     address entry = __ pc();
 9652     BLOCK_COMMENT("Entry:");
 9653     __ flt_to_flt16(r0, v0, v1);
 9654     __ ret(lr);
 9655 
 9656     // record the stub entry and end
 9657     store_archive_data(stub_id, entry, __ pc());
 9658 
 9659     return entry;
 9660   }
 9661 
 9662   address generate_method_entry_barrier() {
 9663     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 9664     int entry_count = StubInfo::entry_count(stub_id);
 9665     assert(entry_count == 1, "sanity check");
 9666     address start = load_archive_data(stub_id);
 9667     if (start != nullptr) {
 9668       return start;
 9669     }
 9670     __ align(CodeEntryAlignment);
 9671     StubCodeMark mark(this, stub_id);
 9672 
 9673     Label deoptimize_label;
 9674 
 9675     start = __ pc();
 9676 
 9677     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 9678 
 9679     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 9680       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 9681       // We can get here despite the nmethod being good, if we have not
 9682       // yet applied our cross modification fence (or data fence).
 9683       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 9684       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 9685       __ ldrw(rscratch2, rscratch2);
 9686       __ strw(rscratch2, thread_epoch_addr);
 9687       __ isb();
 9688       __ membar(__ LoadLoad);
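      // The ISB acts as the cross-modifying-code barrier here; the LoadLoad
      // membar then orders the patching-epoch load above before the
      // guard-value load performed by the entry-barrier check.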
 9689     }
 9690 
 9691     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 9692 
 9693     __ enter();
 9694     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 9695 
 9696     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 9697 
 9698     __ push_call_clobbered_registers();
 9699 
 9700     __ mov(c_rarg0, rscratch2);
 9701     __ call_VM_leaf
 9702          (CAST_FROM_FN_PTR
 9703           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 9704 
 9705     __ reset_last_Java_frame(true);
 9706 
 9707     __ mov(rscratch1, r0);
 9708 
 9709     __ pop_call_clobbered_registers();
 9710 
 9711     __ cbnz(rscratch1, deoptimize_label);
 9712 
 9713     __ leave();
 9714     __ ret(lr);
 9715 
 9716     __ BIND(deoptimize_label);
 9717 
 9718     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9719     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9720 
 9721     __ mov(sp, rscratch1);
 9722     __ br(rscratch2);
 9723 
 9724     // record the stub entry and end
 9725     store_archive_data(stub_id, start, __ pc());
 9726 
 9727     return start;
 9728   }
 9729 
 9730   // r0  = result
 9731   // r1  = str1
 9732   // r2  = cnt1
 9733   // r3  = str2
 9734   // r4  = cnt2
 9735   // r10 = tmp1
 9736   // r11 = tmp2
 9737   address generate_compare_long_string_same_encoding(bool isLL) {
 9738     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9739     int entry_count = StubInfo::entry_count(stub_id);
 9740     assert(entry_count == 1, "sanity check");
 9741     address start = load_archive_data(stub_id);
 9742     if (start != nullptr) {
 9743       return start;
 9744     }
 9745     __ align(CodeEntryAlignment);
 9746     StubCodeMark mark(this, stub_id);
 9747     address entry = __ pc();
 9748     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9749         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9750 
 9751     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9752 
    // exit from the large loop when fewer than 64 bytes are left to read or
    // we're about to prefetch memory beyond the array bounds
 9755     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9756 
    // 8 bytes were already pre-loaded before jumping to this stub, so do the comparison directly
 9758     __ eor(rscratch2, tmp1, tmp2);
 9759     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9760 
 9761     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9762     // update pointers, because of previous read
 9763     __ add(str1, str1, wordSize);
 9764     __ add(str2, str2, wordSize);
 9765     if (SoftwarePrefetchHintDistance >= 0) {
 9766       __ align(OptoLoopAlignment);
 9767       __ bind(LARGE_LOOP_PREFETCH);
 9768         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9769         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9770 
 9771         for (int i = 0; i < 4; i++) {
 9772           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9773           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9774           __ cmp(tmp1, tmp2);
 9775           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9776           __ br(Assembler::NE, DIFF);
 9777         }
 9778         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9779         __ add(str1, str1, 64);
 9780         __ add(str2, str2, 64);
 9781         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9782         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9783         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9784     }
 9785 
 9786     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9787     __ br(Assembler::LE, LESS16);
 9788     __ align(OptoLoopAlignment);
 9789     __ bind(LOOP_COMPARE16);
 9790       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9791       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9792       __ cmp(tmp1, tmp2);
 9793       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9794       __ br(Assembler::NE, DIFF);
 9795       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9796       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9797       __ br(Assembler::LT, LESS16);
 9798 
 9799       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9800       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9801       __ cmp(tmp1, tmp2);
 9802       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9803       __ br(Assembler::NE, DIFF);
 9804       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9805       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9806       __ br(Assembler::GE, LOOP_COMPARE16);
 9807       __ cbz(cnt2, LENGTH_DIFF);
 9808 
 9809     __ bind(LESS16);
      // compare the next 8 bytes (8 Latin-1 or 4 UTF-16 chars)
 9811       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9812       __ br(Assembler::LE, LESS8);
 9813       __ ldr(tmp1, Address(__ post(str1, 8)));
 9814       __ ldr(tmp2, Address(__ post(str2, 8)));
 9815       __ eor(rscratch2, tmp1, tmp2);
 9816       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9817       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9818 
 9819     __ bind(LESS8); // directly load last 8 bytes
 9820       if (!isLL) {
 9821         __ add(cnt2, cnt2, cnt2);
 9822       }
 9823       __ ldr(tmp1, Address(str1, cnt2));
 9824       __ ldr(tmp2, Address(str2, cnt2));
 9825       __ eor(rscratch2, tmp1, tmp2);
 9826       __ cbz(rscratch2, LENGTH_DIFF);
 9827       __ b(CAL_DIFFERENCE);
 9828 
 9829     __ bind(DIFF);
 9830       __ cmp(tmp1, tmp2);
 9831       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9832       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9833       // reuse rscratch2 register for the result of eor instruction
 9834       __ eor(rscratch2, tmp1, tmp2);
 9835 
 9836     __ bind(CAL_DIFFERENCE);
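      // rev + clz locates the first differing byte from the low end; masking
      // to a char boundary (-8 for Latin-1, -16 for UTF-16) gives the shift
      // that brings the differing chars into the low bits of tmp1/tmp2.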
 9837       __ rev(rscratch2, rscratch2);
 9838       __ clz(rscratch2, rscratch2);
 9839       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9840       __ lsrv(tmp1, tmp1, rscratch2);
 9841       __ lsrv(tmp2, tmp2, rscratch2);
 9842       if (isLL) {
 9843         __ uxtbw(tmp1, tmp1);
 9844         __ uxtbw(tmp2, tmp2);
 9845       } else {
 9846         __ uxthw(tmp1, tmp1);
 9847         __ uxthw(tmp2, tmp2);
 9848       }
 9849       __ subw(result, tmp1, tmp2);
 9850 
 9851     __ bind(LENGTH_DIFF);
 9852       __ ret(lr);
 9853 
 9854     // record the stub entry and end
 9855     store_archive_data(stub_id, entry, __ pc());
 9856 
 9857     return entry;
 9858   }
 9859 
 9860   enum string_compare_mode {
 9861     LL,
 9862     LU,
 9863     UL,
 9864     UU,
 9865   };
 9866 
 9867   // The following registers are declared in aarch64.ad
 9868   // r0  = result
 9869   // r1  = str1
 9870   // r2  = cnt1
 9871   // r3  = str2
 9872   // r4  = cnt2
 9873   // r10 = tmp1
 9874   // r11 = tmp2
 9875   // z0  = ztmp1
 9876   // z1  = ztmp2
 9877   // p0  = pgtmp1
 9878   // p1  = pgtmp2
 9879   address generate_compare_long_string_sve(string_compare_mode mode) {
 9880     StubId stub_id;
 9881     switch (mode) {
 9882       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9883       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9884       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9885       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9886       default: ShouldNotReachHere();
 9887     }
 9888     int entry_count = StubInfo::entry_count(stub_id);
 9889     assert(entry_count == 1, "sanity check");
 9890     address start = load_archive_data(stub_id);
 9891     if (start != nullptr) {
 9892       return start;
 9893     }
 9894     __ align(CodeEntryAlignment);
 9895     StubCodeMark mark(this, stub_id);
 9896     address entry = __ pc();
 9897     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9898              tmp1 = r10, tmp2 = r11;
 9899 
 9900     Label LOOP, DONE, MISMATCH;
 9901     Register vec_len = tmp1;
 9902     Register idx = tmp2;
 9903     // The minimum of the string lengths has been stored in cnt2.
 9904     Register cnt = cnt2;
 9905     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9906     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9907 
 9908 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9909     switch (mode) {                                                            \
 9910       case LL:                                                                 \
 9911         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9912         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9913         break;                                                                 \
 9914       case LU:                                                                 \
 9915         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9916         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9917         break;                                                                 \
 9918       case UL:                                                                 \
 9919         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9920         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9921         break;                                                                 \
 9922       case UU:                                                                 \
 9923         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9924         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9925         break;                                                                 \
 9926       default:                                                                 \
 9927         ShouldNotReachHere();                                                  \
 9928     }
 9929 
 9930     __ mov(idx, 0);
 9931     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9932 
 9933     if (mode == LL) {
 9934       __ sve_cntb(vec_len);
 9935     } else {
 9936       __ sve_cnth(vec_len);
 9937     }
 9938 
 9939     __ sub(rscratch1, cnt, vec_len);
 9940 
 9941     __ bind(LOOP);
 9942 
 9943       // main loop
 9944       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9945       __ add(idx, idx, vec_len);
 9946       // Compare strings.
 9947       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9948       __ br(__ NE, MISMATCH);
 9949       __ cmp(idx, rscratch1);
 9950       __ br(__ LT, LOOP);
 9951 
 9952     // post loop, last iteration
 9953     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9954 
 9955     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9956     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9957     __ br(__ EQ, DONE);
 9958 
 9959     __ bind(MISMATCH);
 9960 
    // Crop the predicate at the first mismatch so the differing lanes can be extracted.
 9962     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9963     // Extract the first different characters of each string.
 9964     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9965     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9966 
 9967     // Compute the difference of the first different characters.
 9968     __ sub(result, rscratch1, rscratch2);
 9969 
 9970     __ bind(DONE);
 9971     __ ret(lr);
 9972 #undef LOAD_PAIR
 9973 
 9974     // record the stub entry and end
 9975     store_archive_data(stub_id, entry, __ pc());
 9976 
 9977     return entry;
 9978   }
 9979 
 9980   void generate_compare_long_strings() {
 9981     if (UseSVE == 0) {
 9982       StubRoutines::aarch64::_compare_long_string_LL
 9983           = generate_compare_long_string_same_encoding(true);
 9984       StubRoutines::aarch64::_compare_long_string_UU
 9985           = generate_compare_long_string_same_encoding(false);
 9986       StubRoutines::aarch64::_compare_long_string_LU
 9987           = generate_compare_long_string_different_encoding(true);
 9988       StubRoutines::aarch64::_compare_long_string_UL
 9989           = generate_compare_long_string_different_encoding(false);
 9990     } else {
 9991       StubRoutines::aarch64::_compare_long_string_LL
 9992           = generate_compare_long_string_sve(LL);
 9993       StubRoutines::aarch64::_compare_long_string_UU
 9994           = generate_compare_long_string_sve(UU);
 9995       StubRoutines::aarch64::_compare_long_string_LU
 9996           = generate_compare_long_string_sve(LU);
 9997       StubRoutines::aarch64::_compare_long_string_UL
 9998           = generate_compare_long_string_sve(UL);
 9999     }
10000   }
10001 
10002   // R0 = result
10003   // R1 = str2
10004   // R2 = cnt1
10005   // R3 = str1
10006   // R4 = cnt2
10007   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
10008   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the first register's worth of the pattern
  // loaded (since length >= 8), skipping its reload (helps on systems with
  // a single load pipeline)
  // 2) we can use a "fast" algorithm to find the first occurrence of a
  // single character with fewer branches (one branch per loaded register
  // instead of one per symbol), which is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the first register of the source string,
  // it can be used to search for every occurrence of the first character,
  // saving a few loads compared with a simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations
  // are issued during loads or branches, so the penalty is minimal
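  //
  // A sketch of the SWAR test behind those constants (the byte case):
  //   uint64_t x = chunk ^ pattern01;  // a zero byte wherever chars match
  //   uint64_t t = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
  // t is non-zero iff the register contains the first pattern character;
  // the lowest matching byte reliably sets its 0x80 bit, which the
  // rbit/clz sequence in the body then locates. The orr with 0x7f7f...7f
  // followed by BICS computes the same ~x & 0x80...80 term while also
  // setting the condition flags, so one branch tests a whole register.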
10023   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
10024     StubId stub_id;
10025     if (str1_isL) {
10026       if (str2_isL) {
10027         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
10028       } else {
10029         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
10030       }
10031     } else {
10032       if (str2_isL) {
10033         ShouldNotReachHere();
10034       } else {
10035         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
10036       }
10037     }
10038     int entry_count = StubInfo::entry_count(stub_id);
10039     assert(entry_count == 1, "sanity check");
10040     address start = load_archive_data(stub_id);
10041     if (start != nullptr) {
10042       return start;
10043     }
10044     __ align(CodeEntryAlignment);
10045     StubCodeMark mark(this, stub_id);
10046     address entry = __ pc();
10047 
10048     int str1_chr_size = str1_isL ? 1 : 2;
10049     int str2_chr_size = str2_isL ? 1 : 2;
10050     int str1_chr_shift = str1_isL ? 0 : 1;
10051     int str2_chr_shift = str2_isL ? 0 : 1;
10052     bool isL = str1_isL && str2_isL;
    // parameters
10054     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
10055     // temporary registers
10056     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
10057     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
10058     // redefinitions
10059     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
10060 
10061     __ push(spilled_regs, sp);
10062     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
10063         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
10064         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
10065         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
10066         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
10067         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1; it is safe because length >= 8 here
10069     __ ldr(ch1, Address(str1));
    // Read a whole register from str2; it is safe because length >= 8 here
10071     __ ldr(ch2, Address(str2));
10072     __ sub(cnt2, cnt2, cnt1);
10073     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
10074     if (str1_isL != str2_isL) {
10075       __ eor(v0, __ T16B, v0, v0);
10076     }
10077     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
10078     __ mul(first, first, tmp1);
    // check if we have less than one register's worth of characters to check
10080     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
10081     if (str1_isL != str2_isL) {
10082       __ fmovd(v1, ch1);
10083     }
10084     __ br(__ LE, L_SMALL);
10085     __ eor(ch2, first, ch2);
10086     if (str1_isL != str2_isL) {
10087       __ zip1(v1, __ T16B, v1, v0);
10088     }
10089     __ sub(tmp2, ch2, tmp1);
10090     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10091     __ bics(tmp2, tmp2, ch2);
10092     if (str1_isL != str2_isL) {
10093       __ fmovd(ch1, v1);
10094     }
10095     __ br(__ NE, L_HAS_ZERO);
10096     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10097     __ add(result, result, wordSize/str2_chr_size);
10098     __ add(str2, str2, wordSize);
10099     __ br(__ LT, L_POST_LOOP);
10100     __ BIND(L_LOOP);
10101       __ ldr(ch2, Address(str2));
10102       __ eor(ch2, first, ch2);
10103       __ sub(tmp2, ch2, tmp1);
10104       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10105       __ bics(tmp2, tmp2, ch2);
10106       __ br(__ NE, L_HAS_ZERO);
10107     __ BIND(L_LOOP_PROCEED);
10108       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10109       __ add(str2, str2, wordSize);
10110       __ add(result, result, wordSize/str2_chr_size);
10111       __ br(__ GE, L_LOOP);
10112     __ BIND(L_POST_LOOP);
10113       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
10114       __ br(__ LE, NOMATCH);
10115       __ ldr(ch2, Address(str2));
10116       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10117       __ eor(ch2, first, ch2);
10118       __ sub(tmp2, ch2, tmp1);
10119       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10120       __ mov(tmp4, -1); // all bits set
10121       __ b(L_SMALL_PROCEED);
10122     __ align(OptoLoopAlignment);
10123     __ BIND(L_SMALL);
10124       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10125       __ eor(ch2, first, ch2);
10126       if (str1_isL != str2_isL) {
10127         __ zip1(v1, __ T16B, v1, v0);
10128       }
10129       __ sub(tmp2, ch2, tmp1);
10130       __ mov(tmp4, -1); // all bits set
10131       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10132       if (str1_isL != str2_isL) {
10133         __ fmovd(ch1, v1); // move converted 4 symbols
10134       }
10135     __ BIND(L_SMALL_PROCEED);
10136       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
10137       __ bic(tmp2, tmp2, ch2);
10138       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
10139       __ rbit(tmp2, tmp2);
10140       __ br(__ EQ, NOMATCH);
10141     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
10143       __ cmp(cnt1, u1(wordSize/str2_chr_size));
10144       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
10145       if (str2_isL) { // LL
10146         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10147         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10148         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10149         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10150         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10151       } else {
        __ mov(ch2, 0xE); // byte-offset mask with the lowest bit clear (keeps the offset char-aligned)
10153         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10154         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10155         __ lslv(tmp2, tmp2, tmp4);
10156         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10157         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10158         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10159         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10160       }
10161       __ cmp(ch1, ch2);
10162       __ mov(tmp4, wordSize/str2_chr_size);
10163       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10164     __ BIND(L_SMALL_CMP_LOOP);
10165       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10166                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10167       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10168                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10169       __ add(tmp4, tmp4, 1);
10170       __ cmp(tmp4, cnt1);
10171       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
10172       __ cmp(first, ch2);
10173       __ br(__ EQ, L_SMALL_CMP_LOOP);
10174     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
10175       __ cbz(tmp2, NOMATCH); // no more matches. exit
10176       __ clz(tmp4, tmp2);
10177       __ add(result, result, 1); // advance index
10178       __ add(str2, str2, str2_chr_size); // advance pointer
10179       __ b(L_SMALL_HAS_ZERO_LOOP);
10180     __ align(OptoLoopAlignment);
10181     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
10182       __ cmp(first, ch2);
10183       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10184       __ b(DONE);
10185     __ align(OptoLoopAlignment);
10186     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
10187       if (str2_isL) { // LL
10188         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10189         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10190         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10191         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10192         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10193       } else {
        __ mov(ch2, 0xE); // byte-offset mask with the lowest bit clear (keeps the offset char-aligned)
10195         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10196         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10197         __ lslv(tmp2, tmp2, tmp4);
10198         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10199         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10200         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10201         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10202       }
10203       __ cmp(ch1, ch2);
10204       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10205       __ b(DONE);
10206     __ align(OptoLoopAlignment);
10207     __ BIND(L_HAS_ZERO);
10208       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now, compress the counters (cnt2 and cnt1) into one register. This
      // is fine because both counters are 32-bit and are not changed in this
      // loop; just restore them on exit. So cnt1 can be re-used in this loop.
10213       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
10214       __ sub(result, result, 1);
10215     __ BIND(L_HAS_ZERO_LOOP);
10216       __ mov(cnt1, wordSize/str2_chr_size);
10217       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10218       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
10219       if (str2_isL) {
10220         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10221         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10222         __ lslv(tmp2, tmp2, tmp4);
10223         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10224         __ add(tmp4, tmp4, 1);
10225         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10226         __ lsl(tmp2, tmp2, 1);
10227         __ mov(tmp4, wordSize/str2_chr_size);
10228       } else {
10229         __ mov(ch2, 0xE);
10230         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10231         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10232         __ lslv(tmp2, tmp2, tmp4);
10233         __ add(tmp4, tmp4, 1);
10234         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10235         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10236         __ lsl(tmp2, tmp2, 1);
10237         __ mov(tmp4, wordSize/str2_chr_size);
10238         __ sub(str2, str2, str2_chr_size);
10239       }
10240       __ cmp(ch1, ch2);
10241       __ mov(tmp4, wordSize/str2_chr_size);
10242       __ br(__ NE, L_CMP_LOOP_NOMATCH);
10243     __ BIND(L_CMP_LOOP);
10244       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10245                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10246       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10247                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10248       __ add(tmp4, tmp4, 1);
10249       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10250       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
10251       __ cmp(cnt1, ch2);
10252       __ br(__ EQ, L_CMP_LOOP);
10253     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this candidate position
10255       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
10256       __ clz(tmp4, tmp2);
10257       __ add(str2, str2, str2_chr_size); // advance pointer
10258       __ b(L_HAS_ZERO_LOOP);
10259     __ align(OptoLoopAlignment);
10260     __ BIND(L_CMP_LOOP_LAST_CMP);
10261       __ cmp(cnt1, ch2);
10262       __ br(__ NE, L_CMP_LOOP_NOMATCH);
10263       __ b(DONE);
10264     __ align(OptoLoopAlignment);
10265     __ BIND(L_CMP_LOOP_LAST_CMP2);
10266       if (str2_isL) {
10267         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10268         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10269         __ lslv(tmp2, tmp2, tmp4);
10270         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10271         __ add(tmp4, tmp4, 1);
10272         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10273         __ lsl(tmp2, tmp2, 1);
10274       } else {
10275         __ mov(ch2, 0xE);
10276         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10277         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10278         __ lslv(tmp2, tmp2, tmp4);
10279         __ add(tmp4, tmp4, 1);
10280         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10281         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10282         __ lsl(tmp2, tmp2, 1);
10283         __ sub(str2, str2, str2_chr_size);
10284       }
10285       __ cmp(ch1, ch2);
10286       __ br(__ NE, L_CMP_LOOP_NOMATCH);
10287       __ b(DONE);
10288     __ align(OptoLoopAlignment);
10289     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet
      // was analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective high bits are
      // unchanged. L_LOOP_PROCEED will increase result by the number of
      // analyzed characters, so we can just reset the lower bits of result
      // here: clear the 2 lower bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or
      // result & 3 (UU/UL) is the index of the last analyzed substring
      // inside the current octet, so str2 is at the respective start
      // address and needs to be advanced to the next octet.
10300       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
10301       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
10302       __ bfm(result, zr, 0, 2 - str2_chr_shift);
10303       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
10304       __ movw(cnt2, cnt2);
10305       __ b(L_LOOP_PROCEED);
10306     __ align(OptoLoopAlignment);
10307     __ BIND(NOMATCH);
10308       __ mov(result, -1);
10309     __ BIND(DONE);
10310       __ pop(spilled_regs, sp);
10311       __ ret(lr);
10312 
10313     // record the stub entry and end
10314     store_archive_data(stub_id, entry, __ pc());
10315 
10316     return entry;
10317   }
10318 
10319   void generate_string_indexof_stubs() {
10320     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
10321     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
10322     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
10323   }
10324 
10325   void inflate_and_store_2_fp_registers(bool generatePrfm,
10326       FloatRegister src1, FloatRegister src2) {
10327     Register dst = r1;
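    // Interleaving each Latin-1 byte with a zero byte from v0 (zip1 for the
    // low half, zip2 for the high half) widens it to a little-endian UTF-16
    // char; the four resulting vectors are stored with a single 64-byte st1.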
10328     __ zip1(v1, __ T16B, src1, v0);
10329     __ zip2(v2, __ T16B, src1, v0);
10330     if (generatePrfm) {
10331       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
10332     }
10333     __ zip1(v3, __ T16B, src2, v0);
10334     __ zip2(v4, __ T16B, src2, v0);
10335     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
10336   }
10337 
10338   // R0 = src
10339   // R1 = dst
10340   // R2 = len
10341   // R3 = len >> 3
10342   // V0 = 0
10343   // v1 = loaded 8 bytes
10344   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
10345   address generate_large_byte_array_inflate() {
10346     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
10347     int entry_count = StubInfo::entry_count(stub_id);
10348     assert(entry_count == 1, "sanity check");
10349     address start = load_archive_data(stub_id);
10350     if (start != nullptr) {
10351       return start;
10352     }
10353     __ align(CodeEntryAlignment);
10354     StubCodeMark mark(this, stub_id);
10355     address entry = __ pc();
10356     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
10357     Register src = r0, dst = r1, len = r2, octetCounter = r3;
10358     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
10359 
    // do one more 8-byte read so the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction
10362     __ ldrd(v2, __ post(src, 8));
10363     __ sub(octetCounter, octetCounter, 2);
10364     __ zip1(v1, __ T16B, v1, v0);
10365     __ zip1(v2, __ T16B, v2, v0);
10366     __ st1(v1, v2, __ T16B, __ post(dst, 32));
10367     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10368     __ subs(rscratch1, octetCounter, large_loop_threshold);
10369     __ br(__ LE, LOOP_START);
10370     __ b(LOOP_PRFM_START);
10371     __ bind(LOOP_PRFM);
10372       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10373     __ bind(LOOP_PRFM_START);
10374       __ prfm(Address(src, SoftwarePrefetchHintDistance));
10375       __ sub(octetCounter, octetCounter, 8);
10376       __ subs(rscratch1, octetCounter, large_loop_threshold);
10377       inflate_and_store_2_fp_registers(true, v3, v4);
10378       inflate_and_store_2_fp_registers(true, v5, v6);
10379       __ br(__ GT, LOOP_PRFM);
10380       __ cmp(octetCounter, (u1)8);
10381       __ br(__ LT, DONE);
10382     __ bind(LOOP);
10383       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10384       __ bind(LOOP_START);
10385       __ sub(octetCounter, octetCounter, 8);
10386       __ cmp(octetCounter, (u1)8);
10387       inflate_and_store_2_fp_registers(false, v3, v4);
10388       inflate_and_store_2_fp_registers(false, v5, v6);
10389       __ br(__ GE, LOOP);
10390     __ bind(DONE);
10391       __ ret(lr);
10392 
10393     // record the stub entry and end
10394     store_archive_data(stub_id, entry, __ pc());
10395 
10396     return entry;
10397   }
10398 
10399   /**
10400    *  Arguments:
10401    *
10402    *  Input:
10403    *  c_rarg0   - current state address
10404    *  c_rarg1   - H key address
10405    *  c_rarg2   - data address
10406    *  c_rarg3   - number of blocks
10407    *
10408    *  Output:
10409    *  Updated state at c_rarg0
10410    */
10411   address generate_ghash_processBlocks_small() {
10412     // Bafflingly, GCM uses little-endian for the byte order, but
10413     // big-endian for the bit order.  For example, the polynomial 1 is
10414     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
10415     //
10416     // So, we must either reverse the bytes in each word and do
10417     // everything big-endian or reverse the bits in each byte and do
10418     // it little-endian.  On AArch64 it's more idiomatic to reverse
10419     // the bits in each byte (we have an instruction, RBIT, to do
10420     // that) and keep the data in little-endian bit order through the
10421     // calculation, bit-reversing the inputs and outputs.
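    //
    // For example, RBIT maps the GCM encoding of the polynomial 1 (0x80 in
    // the first byte) to 0x01 in the first byte, i.e. the little-endian
    // 128-bit integer 1 that the carry-less multiplies below expect.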
10422 
10423     StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
10424     int entry_count = StubInfo::entry_count(stub_id);
10425     assert(entry_count == 1, "sanity check");
10426     address start = load_archive_data(stub_id);
10427     if (start != nullptr) {
10428       return start;
10429     }
10430     __ align(CodeEntryAlignment);
10431     StubCodeMark mark(this, stub_id);
10432     Label polynomial; // local data generated at end of stub
10433     start = __ pc();
10434 
10435     Register state   = c_rarg0;
10436     Register subkeyH = c_rarg1;
10437     Register data    = c_rarg2;
10438     Register blocks  = c_rarg3;
10439 
10440     FloatRegister vzr = v30;
10441     __ eor(vzr, __ T16B, vzr, vzr); // zero register
10442 
10443     __ adr(rscratch1, polynomial);
10444     __ ldrq(v24, rscratch1);    // The field polynomial
10445 
10446     __ ldrq(v0, Address(state));
10447     __ ldrq(v1, Address(subkeyH));
10448 
10449     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
10450     __ rbit(v0, __ T16B, v0);
10451     __ rev64(v1, __ T16B, v1);
10452     __ rbit(v1, __ T16B, v1);
10453 
10454     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
10455     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
10456 
10457     {
10458       Label L_ghash_loop;
10459       __ bind(L_ghash_loop);
10460 
10461       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
10462                                                  // reversing each byte
10463       __ rbit(v2, __ T16B, v2);
10464       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
10465 
10466       // Multiply state in v2 by subkey in v1
10467       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
10468                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
10469                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
10470       // Reduce v7:v5 by the field polynomial
10471       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
10472 
10473       __ sub(blocks, blocks, 1);
10474       __ cbnz(blocks, L_ghash_loop);
10475     }
10476 
10477     // The bit-reversed result is at this point in v0
10478     __ rev64(v0, __ T16B, v0);
10479     __ rbit(v0, __ T16B, v0);
10480 
10481     __ st1(v0, __ T16B, state);
10482     __ ret(lr);
10483 
10484     // bind label and generate local polynomial data
10485     __ align(wordSize * 2);
10486     __ bind(polynomial);
10487     __ emit_int64(0x87);  // The low-order bits of the field
10488                           // polynomial (i.e. p = z^7+z^2+z+1)
10489                           // repeated in the low and high parts of a
10490                           // 128-bit vector
10491     __ emit_int64(0x87);
10492 
10493     // record the stub entry and end
10494     store_archive_data(stub_id, start, __ pc());
10495 
10496     return start;
10497   }
10498 
10499   address generate_ghash_processBlocks(address small) {
10500     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
10501     int entry_count = StubInfo::entry_count(stub_id);
10502     assert(entry_count == 1, "sanity check");
10503     address start = load_archive_data(stub_id);
10504     if (start != nullptr) {
10505       return start;
10506     }
10507     Label polynomial;           // local data generated after stub
10508     __ align(CodeEntryAlignment);
10509     StubCodeMark mark(this, stub_id);
10510     start = __ pc();
10511 
10512     Register state   = c_rarg0;
10513     Register subkeyH = c_rarg1;
10514     Register data    = c_rarg2;
10515     Register blocks  = c_rarg3;
10516 
10517     const int unroll = 4;
10518 
10519     __ cmp(blocks, (unsigned char)(unroll * 2));
10520     __ br(__ LT, small);
10521 
10522     if (unroll > 1) {
      // Save state before entering routine
10524       __ sub(sp, sp, 4 * 16);
10525       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
10526       __ sub(sp, sp, 4 * 16);
10527       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
10528     }
10529 
10530     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
10531 
10532     if (unroll > 1) {
10533       // And restore state
10534       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
10535       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
10536     }
10537 
10538     __ cmp(blocks, (unsigned char)0);
10539     __ br(__ GT, small);
10540 
10541     __ ret(lr);
10542 
10543     // bind label and generate polynomial data
10544     __ align(wordSize * 2);
10545     __ bind(polynomial);
10546     __ emit_int64(0x87);  // The low-order bits of the field
10547                           // polynomial (i.e. p = z^7+z^2+z+1)
10548                           // repeated in the low and high parts of a
10549                           // 128-bit vector
10550     __ emit_int64(0x87);
10551 
10552     // record the stub entry and end
10553     store_archive_data(stub_id, start, __ pc());
10554 
10555     return start;
10556   }
10557 
10558   void generate_base64_encode_simdround(Register src, Register dst,
10559         FloatRegister codec, u8 size) {
10560 
10561     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
10562     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
10563     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
10564 
10565     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
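    // Each round splits 3 * size input bytes into four vectors of 6-bit
    // indices (ind0..ind3) and maps them through the 64-entry codec table
    // (held in four registers starting at 'codec') with TBL, producing
    // 4 * size encoded bytes.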
10566 
10567     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
10568 
10569     __ ushr(ind0, arrangement, in0,  2);
10570 
10571     __ ushr(ind1, arrangement, in1,  2);
10572     __ shl(in0,   arrangement, in0,  6);
10573     __ orr(ind1,  arrangement, ind1, in0);
10574     __ ushr(ind1, arrangement, ind1, 2);
10575 
10576     __ ushr(ind2, arrangement, in2,  4);
10577     __ shl(in1,   arrangement, in1,  4);
10578     __ orr(ind2,  arrangement, in1,  ind2);
10579     __ ushr(ind2, arrangement, ind2, 2);
10580 
10581     __ shl(ind3,  arrangement, in2,  2);
10582     __ ushr(ind3, arrangement, ind3, 2);
10583 
10584     __ tbl(out0,  arrangement, codec,  4, ind0);
10585     __ tbl(out1,  arrangement, codec,  4, ind1);
10586     __ tbl(out2,  arrangement, codec,  4, ind2);
10587     __ tbl(out3,  arrangement, codec,  4, ind3);
10588 
10589     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
10590   }
10591 
10592    /**
10593    *  Arguments:
10594    *
10595    *  Input:
10596    *  c_rarg0   - src_start
10597    *  c_rarg1   - src_offset
10598    *  c_rarg2   - src_length
10599    *  c_rarg3   - dest_start
10600    *  c_rarg4   - dest_offset
10601    *  c_rarg5   - isURL
10602    *
10603    */
10604   address generate_base64_encodeBlock() {
10605 
10606     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
10607     int entry_count = StubInfo::entry_count(stub_id);
10608     assert(entry_count == 1, "sanity check");
10609     address start = load_archive_data(stub_id);
10610     if (start != nullptr) {
10611       return start;
10612     }
10613     __ align(CodeEntryAlignment);
10614     StubCodeMark mark(this, stub_id);
10615     start = __ pc();
10616 
10617     Register src   = c_rarg0;  // source array
10618     Register soff  = c_rarg1;  // source start offset
10619     Register send  = c_rarg2;  // source end offset
10620     Register dst   = c_rarg3;  // dest array
10621     Register doff  = c_rarg4;  // position for writing to dest array
10622     Register isURL = c_rarg5;  // Base64 or URL character set
10623 
10624     // c_rarg6 and c_rarg7 are free to use as temps
10625     Register codec  = c_rarg6;
10626     Register length = c_rarg7;
10627 
10628     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
10629 
10630     __ add(src, src, soff);
10631     __ add(dst, dst, doff);
10632     __ sub(length, send, soff);
10633 
10634     // load the codec base address
10635     __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
10636     __ cbz(isURL, ProcessData);
10637     __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));
10638 
10639     __ BIND(ProcessData);
10640 
    // too short to form a SIMD loop; fall back to byte-by-byte processing
10642     __ cmp(length, (u1)24);
10643     __ br(Assembler::LT, Process3B);
10644 
10645     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
10646 
10647     __ BIND(Process48B);
10648     __ cmp(length, (u1)48);
10649     __ br(Assembler::LT, Process24B);
10650     generate_base64_encode_simdround(src, dst, v0, 16);
10651     __ sub(length, length, 48);
10652     __ b(Process48B);
10653 
10654     __ BIND(Process24B);
10655     __ cmp(length, (u1)24);
10656     __ br(Assembler::LT, SIMDExit);
10657     generate_base64_encode_simdround(src, dst, v0, 8);
10658     __ sub(length, length, 24);
10659 
10660     __ BIND(SIMDExit);
10661     __ cbz(length, Exit);
10662 
10663     __ BIND(Process3B);
10664     //  3 src bytes, 24 bits
10665     __ ldrb(r10, __ post(src, 1));
10666     __ ldrb(r11, __ post(src, 1));
10667     __ ldrb(r12, __ post(src, 1));
10668     __ orrw(r11, r11, r10, Assembler::LSL, 8);
10669     __ orrw(r12, r12, r11, Assembler::LSL, 8);
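    // r12 now holds the 24-bit group (b0 << 16) | (b1 << 8) | b2; the
    // extracts below peel off its four 6-bit fields from high to low.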
10670     // codec index
10671     __ ubfmw(r15, r12, 18, 23);
10672     __ ubfmw(r14, r12, 12, 17);
10673     __ ubfmw(r13, r12, 6,  11);
10674     __ andw(r12,  r12, 63);
10675     // get the code based on the codec
10676     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
10677     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
10678     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
10679     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
10680     __ strb(r15, __ post(dst, 1));
10681     __ strb(r14, __ post(dst, 1));
10682     __ strb(r13, __ post(dst, 1));
10683     __ strb(r12, __ post(dst, 1));
10684     __ sub(length, length, 3);
10685     __ cbnz(length, Process3B);
10686 
10687     __ BIND(Exit);
10688     __ ret(lr);
10689 
10690     // record the stub entry and end
10691     store_archive_data(stub_id, start, __ pc());
10692 
10693     return start;
10694   }
10695 
10696   void generate_base64_decode_simdround(Register src, Register dst,
10697         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
10698 
10699     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
10700     FloatRegister out0 = v20, out1 = v21, out2 = v22;
10701 
10702     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
10703     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
10704 
10705     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
10706 
10707     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10708 
10709     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
10710 
    // We need an unsigned saturating subtract so that every input value
    // in the range [0, 63] yields 0 in the higher-half lookup.
10713     __ uqsubv(decH0, __ T16B, in0, v27);
10714     __ uqsubv(decH1, __ T16B, in1, v27);
10715     __ uqsubv(decH2, __ T16B, in2, v27);
10716     __ uqsubv(decH3, __ T16B, in3, v27);
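
    // Sketch of the split lookup: the decode table has 128 entries,
    // spread across codecL (indexed by the input directly via tbl,
    // which yields 0 for indices >= 64) and codecH (indexed by the
    // saturated input - 63 via tbx, which leaves out-of-range lanes
    // unchanged). The tables are laid out so that ORing the two
    // partial results below recovers the full 128-entry lookup.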
10717 
10718     // lower half lookup
10719     __ tbl(decL0, arrangement, codecL, 4, in0);
10720     __ tbl(decL1, arrangement, codecL, 4, in1);
10721     __ tbl(decL2, arrangement, codecL, 4, in2);
10722     __ tbl(decL3, arrangement, codecL, 4, in3);
10723 
10724     // higher half lookup
10725     __ tbx(decH0, arrangement, codecH, 4, decH0);
10726     __ tbx(decH1, arrangement, codecH, 4, decH1);
10727     __ tbx(decH2, arrangement, codecH, 4, decH2);
10728     __ tbx(decH3, arrangement, codecH, 4, decH3);
10729 
10730     // combine lower and higher
10731     __ orr(decL0, arrangement, decL0, decH0);
10732     __ orr(decL1, arrangement, decL1, decH1);
10733     __ orr(decL2, arrangement, decL2, decH2);
10734     __ orr(decL3, arrangement, decL3, decH3);
10735 
    // check for illegal inputs: any value larger than 63 (the maximum of 6 bits)
10737     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10738     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10739     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10740     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10741     __ orr(in0, arrangement, decH0, decH1);
10742     __ orr(in1, arrangement, decH2, decH3);
10743     __ orr(in2, arrangement, in0,   in1);
10744     __ umaxv(in3, arrangement, in2);
10745     __ umov(rscratch2, in3, __ B, 0);
10746 
10747     // get the data to output
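    // Per lane, this packs four 6-bit values into three output bytes:
    //   out0 = (decL0 << 2) | (decL1 >> 4)
    //   out1 = (decL1 << 4) | (decL2 >> 2)
    //   out2 = (decL2 << 6) | decL3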
10748     __ shl(out0,  arrangement, decL0, 2);
10749     __ ushr(out1, arrangement, decL1, 4);
10750     __ orr(out0,  arrangement, out0,  out1);
10751     __ shl(out1,  arrangement, decL1, 4);
10752     __ ushr(out2, arrangement, decL2, 2);
10753     __ orr(out1,  arrangement, out1,  out2);
10754     __ shl(out2,  arrangement, decL2, 6);
10755     __ orr(out2,  arrangement, out2,  decL3);
10756 
10757     __ cbz(rscratch2, NoIllegalData);
10758 
10759     // handle illegal input
10760     __ umov(r10, in2, __ D, 0);
10761     if (size == 16) {
10762       __ cbnz(r10, ErrorInLowerHalf);
10763 
10764       // illegal input is in higher half, store the lower half now.
10765       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10766 
10767       __ umov(r10, in2,  __ D, 1);
10768       __ umov(r11, out0, __ D, 1);
10769       __ umov(r12, out1, __ D, 1);
10770       __ umov(r13, out2, __ D, 1);
10771       __ b(StoreLegalData);
10772 
10773       __ BIND(ErrorInLowerHalf);
10774     }
10775     __ umov(r11, out0, __ D, 0);
10776     __ umov(r12, out1, __ D, 0);
10777     __ umov(r13, out2, __ D, 0);
10778 
10779     __ BIND(StoreLegalData);
10780     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10781     __ strb(r11, __ post(dst, 1));
10782     __ strb(r12, __ post(dst, 1));
10783     __ strb(r13, __ post(dst, 1));
10784     __ lsr(r10, r10, 8);
10785     __ lsr(r11, r11, 8);
10786     __ lsr(r12, r12, 8);
10787     __ lsr(r13, r13, 8);
10788     __ b(StoreLegalData);
10789 
10790     __ BIND(NoIllegalData);
10791     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10792   }
10793 
10794 
  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_end (exclusive; src_end - src_offset bytes are decoded)
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *  c_rarg6   - isMIME
   *
   */
10808   address generate_base64_decodeBlock() {
10809 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the
    // section titled "Base64 decoding".
10813 
10814     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10815     int entry_count = StubInfo::entry_count(stub_id);
10816     assert(entry_count == 1, "sanity check");
10817     address start = load_archive_data(stub_id);
10818     if (start != nullptr) {
10819       return start;
10820     }
10821     __ align(CodeEntryAlignment);
10822     StubCodeMark mark(this, stub_id);
10823     start = __ pc();
10824 
10825     Register src    = c_rarg0;  // source array
10826     Register soff   = c_rarg1;  // source start offset
10827     Register send   = c_rarg2;  // source end offset
10828     Register dst    = c_rarg3;  // dest array
10829     Register doff   = c_rarg4;  // position for writing to dest array
10830     Register isURL  = c_rarg5;  // Base64 or URL character set
10831     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10832 
10833     Register length = send;    // reuse send as length of source data to process
10834 
10835     Register simd_codec   = c_rarg6;
10836     Register nosimd_codec = c_rarg7;
10837 
10838     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10839 
10840     __ enter();
10841 
10842     __ add(src, src, soff);
10843     __ add(dst, dst, doff);
10844 
10845     __ mov(doff, dst);
10846 
10847     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1); // round length down to a multiple of 4
10849 
10850     __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
10851     __ cbz(isURL, ProcessData);
10852     __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));
10853 
10854     __ BIND(ProcessData);
10855     __ mov(rscratch1, length);
    __ cmp(length, (u1)144); // 144 = 80 (scalar pre-pass) + 64 (at least one SIMD round)
10857     __ br(Assembler::LT, Process4B);
10858 
10859     // In the MIME case, the line length cannot be more than 76
10860     // bytes (see RFC 2045). This is too short a block for SIMD
10861     // to be worthwhile, so we use non-SIMD here.
    __ movw(rscratch1, 79); // scalar-decode the first 80 bytes (20 rounds of Process4B)
10863 
10864     __ BIND(Process4B);
10865     __ ldrw(r14, __ post(src, 4));
10866     __ ubfxw(r10, r14, 0,  8);
10867     __ ubfxw(r11, r14, 8,  8);
10868     __ ubfxw(r12, r14, 16, 8);
10869     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
10871     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10872     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10873     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10874     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10875     // error detection, 255u indicates an illegal input
10876     __ orrw(r14, r10, r11);
10877     __ orrw(r15, r12, r13);
10878     __ orrw(r14, r14, r15);
10879     __ tbnz(r14, 7, Exit);
10880     // recover the data
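    // The four decoded sextets d0..d3 (r10..r13) are packed into three
    // output bytes, per the usual Base64 layout:
    //   o0 = (d0 << 2) | (d1 >> 4)
    //   o1 = (d1 << 4) | (d2 >> 2)    (low 8 bits)
    //   o2 = (d2 << 6) | d3           (low 8 bits)
    // r14 is assembled with o0 in its high byte and o1 in its low byte;
    // rev16w swaps them so the little-endian strh stores o0 first.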
10881     __ lslw(r14, r10, 10);
10882     __ bfiw(r14, r11, 4, 6);
10883     __ bfmw(r14, r12, 2, 5);
10884     __ rev16w(r14, r14);
10885     __ bfiw(r13, r12, 6, 2);
10886     __ strh(r14, __ post(dst, 2));
10887     __ strb(r13, __ post(dst, 1));
10888     // non-simd loop
10889     __ subsw(rscratch1, rscratch1, 4);
10890     __ br(Assembler::GT, Process4B);
10891 
    // if we exited from the 80-byte pre-processing pass (rscratch1
    // started at 79), rscratch1 == -1; otherwise (whole input decoded
    // scalar, rscratch1 started at length), rscratch1 == 0.
10894     __ cbzw(rscratch1, Exit);
10895     __ sub(length, length, 80);
10896 
10897     __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
10898     __ cbz(isURL, SIMDEnter);
10899     __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));
10900 
10901     __ BIND(SIMDEnter);
10902     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10903     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10904     __ mov(rscratch1, 63);
10905     __ dup(v27, __ T16B, rscratch1);
10906 
10907     __ BIND(Process64B);
10908     __ cmp(length, (u1)64);
10909     __ br(Assembler::LT, Process32B);
10910     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10911     __ sub(length, length, 64);
10912     __ b(Process64B);
10913 
10914     __ BIND(Process32B);
10915     __ cmp(length, (u1)32);
10916     __ br(Assembler::LT, SIMDExit);
10917     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10918     __ sub(length, length, 32);
10919     __ b(Process32B);
10920 
10921     __ BIND(SIMDExit);
10922     __ cbz(length, Exit);
10923     __ movw(rscratch1, length);
10924     __ b(Process4B);
10925 
10926     __ BIND(Exit);
10927     __ sub(c_rarg0, dst, doff);
10928 
10929     __ leave();
10930     __ ret(lr);
10931 
10932     // record the stub entry and end
10933     store_archive_data(stub_id, start, __ pc());
10934 
10935     return start;
10936   }
10937 
10938   // Support for spin waits.
10939   address generate_spin_wait() {
10940     StubId stub_id = StubId::stubgen_spin_wait_id;
10941     int entry_count = StubInfo::entry_count(stub_id);
10942     assert(entry_count == 1, "sanity check");
10943     address start = load_archive_data(stub_id);
10944     if (start != nullptr) {
10945       return start;
10946     }
10947     __ align(CodeEntryAlignment);
10948     StubCodeMark mark(this, stub_id);
10949     start = __ pc();
10950 
10951     __ spin_wait();
10952     __ ret(lr);
10953 
10954     // record the stub entry and end
10955     store_archive_data(stub_id, start, __ pc());
10956 
10957     return start;
10958   }
10959 
10960   void generate_lookup_secondary_supers_table_stub() {
10961     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10962     GrowableArray<address> entries;
10963     int entry_count = StubInfo::entry_count(stub_id);
10964     assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
10965     address start = load_archive_data(stub_id, &entries);
10966     if (start != nullptr) {
10967       assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
10968              "unexpected extra entry count %d", entries.length());
10969       StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
10970       for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10971         StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
10972       }
10973       return;
10974     }
10975 
10976     StubCodeMark mark(this, stub_id);
10977 
10978     const Register
10979       r_super_klass  = r0,
10980       r_array_base   = r1,
10981       r_array_length = r2,
10982       r_array_index  = r3,
10983       r_sub_klass    = r4,
10984       r_bitmap       = rscratch2,
10985       result         = r5;
10986     const FloatRegister
10987       vtemp          = v0;
10988 
10989     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10990       address next_entry = __ pc();
10991       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
10992       if (slot == 0) {
10993         start = next_entry;
10994       } else {
10995         entries.append(next_entry);
10996       }
10997       Label L_success;
10998       __ enter();
10999       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
11000                                              r_array_base, r_array_length, r_array_index,
11001                                              vtemp, result, slot,
11002                                              /*stub_is_near*/true);
11003       __ leave();
11004       __ ret(lr);
11005     }
11006     // record the stub entry and end plus all the auxiliary entries
11007     store_archive_data(stub_id, start, __ pc(), &entries);
11008   }
11009 
11010   // Slow path implementation for UseSecondarySupersTable.
11011   address generate_lookup_secondary_supers_table_slow_path_stub() {
11012     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
11013     int entry_count = StubInfo::entry_count(stub_id);
11014     assert(entry_count == 1, "sanity check");
11015     address start = load_archive_data(stub_id);
11016     if (start != nullptr) {
11017       return start;
11018     }
11019     StubCodeMark mark(this, stub_id);
11020     start = __ pc();
11021     const Register
11022       r_super_klass  = r0,        // argument
11023       r_array_base   = r1,        // argument
11024       temp1          = r2,        // temp
11025       r_array_index  = r3,        // argument
11026       r_bitmap       = rscratch2, // argument
11027       result         = r5;        // argument
11028 
11029     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
11030     __ ret(lr);
11031 
11032     // record the stub entry and end
11033     store_archive_data(stub_id, start, __ pc());
11034 
11035     return start;
11036   }
11037 
11038 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11039 
11040   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
11041   //
11042   // If LSE is in use, generate LSE versions of all the stubs. The
11043   // non-LSE versions are in atomic_aarch64.S.
11044 
11045   // class AtomicStubMark records the entry point of a stub and the
11046   // stub pointer which will point to it. The stub pointer is set to
11047   // the entry point when ~AtomicStubMark() is called, which must be
11048   // after ICache::invalidate_range. This ensures safe publication of
11049   // the generated code.
11050   class AtomicStubMark {
11051     address _entry_point;
11052     aarch64_atomic_stub_t *_stub;
11053     MacroAssembler *_masm;
11054   public:
11055     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
11056       _masm = masm;
11057       __ align(32);
11058       _entry_point = __ pc();
11059       _stub = stub;
11060     }
11061     ~AtomicStubMark() {
11062       *_stub = (aarch64_atomic_stub_t)_entry_point;
11063     }
11064   };
11065 
11066   // NB: For memory_order_conservative we need a trailing membar after
11067   // LSE atomic operations but not a leading membar.
11068   //
11069   // We don't need a leading membar because a clause in the Arm ARM
11070   // says:
11071   //
11072   //   Barrier-ordered-before
11073   //
11074   //   Barrier instructions order prior Memory effects before subsequent
11075   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
11079   //   instruction with both Acquire and Release semantics.
11080   //
11081   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
11082   // and Release semantics, therefore we don't need a leading
11083   // barrier. However, there is no corresponding Barrier-ordered-after
11084   // relationship, therefore we need a trailing membar to prevent a
11085   // later store or load from being reordered with the store in an
11086   // atomic instruction.
11087   //
11088   // This was checked by using the herd7 consistency model simulator
11089   // (http://diy.inria.fr/) with this test case:
11090   //
11091   // AArch64 LseCas
11092   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
11093   // P0 | P1;
11094   // LDR W4, [X2] | MOV W3, #0;
11095   // DMB LD       | MOV W4, #1;
11096   // LDR W3, [X1] | CASAL W3, W4, [X1];
11097   //              | DMB ISH;
11098   //              | STR W4, [X2];
11099   // exists
11100   // (0:X3=0 /\ 0:X4=1)
11101   //
11102   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
11103   // with the store to x in P1. Without the DMB in P1 this may happen.
11104   //
11105   // At the time of writing we don't know of any AArch64 hardware that
11106   // reorders stores in this way, but the Reference Manual permits it.
11107 
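  // Emit one compare-and-swap stub. The stub compares *c_rarg0 with
  // compare_val (c_rarg1) and, if they are equal, stores exchange_val
  // (c_rarg2); the previous memory value is returned in r0.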
11108   void gen_cas_entry(Assembler::operand_size size,
11109                      atomic_memory_order order) {
11110     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
11111       exchange_val = c_rarg2;
11112     bool acquire, release;
11113     switch (order) {
11114       case memory_order_relaxed:
11115         acquire = false;
11116         release = false;
11117         break;
11118       case memory_order_release:
11119         acquire = false;
11120         release = true;
11121         break;
11122       default:
11123         acquire = true;
11124         release = true;
11125         break;
11126     }
11127     __ mov(prev, compare_val);
11128     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
11129     if (order == memory_order_conservative) {
11130       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11131     }
11132     if (size == Assembler::xword) {
11133       __ mov(r0, prev);
11134     } else {
11135       __ movw(r0, prev);
11136     }
11137     __ ret(lr);
11138   }
11139 
11140   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
11141     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11142     // If not relaxed, then default to conservative.  Relaxed is the only
11143     // case we use enough to be worth specializing.
11144     if (order == memory_order_relaxed) {
11145       __ ldadd(size, incr, prev, addr);
11146     } else {
11147       __ ldaddal(size, incr, prev, addr);
11148       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11149     }
11150     if (size == Assembler::xword) {
11151       __ mov(r0, prev);
11152     } else {
11153       __ movw(r0, prev);
11154     }
11155     __ ret(lr);
11156   }
11157 
11158   void gen_swpal_entry(Assembler::operand_size size) {
11159     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11160     __ swpal(size, incr, prev, addr);
11161     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11162     if (size == Assembler::xword) {
11163       __ mov(r0, prev);
11164     } else {
11165       __ movw(r0, prev);
11166     }
11167     __ ret(lr);
11168   }
11169 
11170   void generate_atomic_entry_points() {
    if (!UseLSE) {
11172       return;
11173     }
11174     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
11175     GrowableArray<address> entries;
11176     int entry_count = StubInfo::entry_count(stub_id);
11177     address start = load_archive_data(stub_id, &entries);
11178     if (start != nullptr) {
11179       assert(entries.length() == entry_count - 1,
11180              "unexpected extra entry count %d", entries.length());
11181       aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
11182       int idx = 0;
11183       aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11184       aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11185       aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11186       aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11187       aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11188       aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11189       aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11190       aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11191       aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11192       aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11193       aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11194       aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11195       aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11196       aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11197       aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11198       assert(idx == entries.length(), "sanity!");
11199       return;
11200     }
11201 
11202     __ align(CodeEntryAlignment);
11203     StubCodeMark mark(this, stub_id);
11204     start = __ pc();
11205     address end;
11206     {
11207     // ADD, memory_order_conservative
11208     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
11209     gen_ldadd_entry(Assembler::word, memory_order_conservative);
11210 
11211     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
11212     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
11213 
11214     // ADD, memory_order_relaxed
11215     AtomicStubMark mark_fetch_add_4_relaxed
11216       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
11217     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
11218 
11219     AtomicStubMark mark_fetch_add_8_relaxed
11220       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
11221     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
11222 
11223     // XCHG, memory_order_conservative
11224     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
11225     gen_swpal_entry(Assembler::word);
11226 
11227     AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
11228     gen_swpal_entry(Assembler::xword);
11229 
11230     // CAS, memory_order_conservative
11231     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
11232     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
11233 
11234     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
11235     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
11236 
11237     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
11238     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
11239 
11240     // CAS, memory_order_relaxed
11241     AtomicStubMark mark_cmpxchg_1_relaxed
11242       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
11243     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
11244 
11245     AtomicStubMark mark_cmpxchg_4_relaxed
11246       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
11247     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
11248 
11249     AtomicStubMark mark_cmpxchg_8_relaxed
11250       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
11251     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
11252 
11253     AtomicStubMark mark_cmpxchg_4_release
11254       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
11255     gen_cas_entry(MacroAssembler::word, memory_order_release);
11256 
11257     AtomicStubMark mark_cmpxchg_8_release
11258       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
11259     gen_cas_entry(MacroAssembler::xword, memory_order_release);
11260 
11261     AtomicStubMark mark_cmpxchg_4_seq_cst
11262       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
11263     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
11264 
11265     AtomicStubMark mark_cmpxchg_8_seq_cst
11266       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
11267     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
11268 
11269     end = __ pc();
11270 
11271     ICache::invalidate_range(start, end - start);
11272     // exit block to force update of AtomicStubMark targets
11273     }
11274 
11275     assert(start == (address)aarch64_atomic_fetch_add_4_impl,
11276            "atomic stub should be at start of buffer");
11277     // record the stub start and end plus all the entries saved by the
11278     // AtomicStubMark destructor
11279     entries.append((address)aarch64_atomic_fetch_add_8_impl);
11280     entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
11281     entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
11282     entries.append((address)aarch64_atomic_xchg_4_impl);
11283     entries.append((address)aarch64_atomic_xchg_8_impl);
11284     entries.append((address)aarch64_atomic_cmpxchg_1_impl);
11285     entries.append((address)aarch64_atomic_cmpxchg_4_impl);
11286     entries.append((address)aarch64_atomic_cmpxchg_8_impl);
11287     entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
11288     entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
11289     entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
11290     entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
11291     entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
11292     entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
11293     entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);
11294 
11295     assert(entries.length() == entry_count - 1,
11296            "unexpected extra entry count %d", entries.length());
11297 
11298     store_archive_data(stub_id, start, end, &entries);
11299   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
11301 
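  // Save/restore a possible method return value around a runtime call:
  // the full r0..r7 and v0..v7 sets when inline types may be returned
  // as fields, otherwise just r0 and v0.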
11302   static void save_return_registers(MacroAssembler* masm) {
11303     if (InlineTypeReturnedAsFields) {
11304       masm->push(RegSet::range(r0, r7), sp);
11305       masm->sub(sp, sp, 4 * wordSize);
11306       masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
11307       masm->sub(sp, sp, 4 * wordSize);
11308       masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
11309     } else {
11310       masm->fmovd(rscratch1, v0);
11311       masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
11312     }
11313   }
11314 
11315   static void restore_return_registers(MacroAssembler* masm) {
11316     if (InlineTypeReturnedAsFields) {
11317       masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
11318       masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
11319       masm->pop(RegSet::range(r0, r7), sp);
11320     } else {
11321       masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
11322       masm->fmovd(v0, rscratch1);
11323     }
11324   }
11325 
11326   address generate_cont_thaw(Continuation::thaw_kind kind) {
11327     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
11328     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
11329 
11330     address start = __ pc();
11331 
11332     if (return_barrier) {
11333       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
11334       __ mov(sp, rscratch1);
11335     }
11336     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11337 
11338     if (return_barrier) {
11339       // preserve possible return value from a method returning to the return barrier
11340       save_return_registers(_masm);
11341     }
11342 
11343     __ movw(c_rarg1, (return_barrier ? 1 : 0));
11344     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
11345     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
11346 
11347     if (return_barrier) {
11348       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11349       restore_return_registers(_masm);
11350     }
11351     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11352 
11353 
11354     Label thaw_success;
11355     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
11356     __ cbnz(rscratch2, thaw_success);
11357     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
11358     __ br(rscratch1);
11359     __ bind(thaw_success);
11360 
11361     // make room for the thawed frames
11362     __ sub(rscratch1, sp, rscratch2);
11363     __ andr(rscratch1, rscratch1, -16); // align
11364     __ mov(sp, rscratch1);
11365 
11366     if (return_barrier) {
11367       // save original return value -- again
11368       save_return_registers(_masm);
11369     }
11370 
11371     // If we want, we can templatize thaw by kind, and have three different entries
11372     __ movw(c_rarg1, (uint32_t)kind);
11373 
11374     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
11375     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
11376 
11377     if (return_barrier) {
11378       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11379       restore_return_registers(_masm);
11380     } else {
11381       __ mov(r0, zr); // return 0 (success) from doYield
11382     }
11383 
    // we're now on the yield frame (which is at an address above us because sp has been pushed down)
11385     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
11386     __ mov(rfp, sp);
11387 
11388     if (return_barrier_exception) {
11389       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
11390       __ authenticate_return_address(c_rarg1);
11391       __ verify_oop(r0);
11392       // save return value containing the exception oop in callee-saved R19
11393       __ mov(r19, r0);
11394 
11395       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
11396 
11397       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
11398       // __ reinitialize_ptrue();
11399 
11400       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
11401 
11402       __ mov(r1, r0); // the exception handler
11403       __ mov(r0, r19); // restore return value containing the exception oop
11404       __ verify_oop(r0);
11405 
11406       __ leave();
11407       __ mov(r3, lr);
11408       __ br(r1); // the exception handler
11409     } else {
11410       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
11411       __ leave();
11412       __ ret(lr);
11413     }
11414 
11415     return start;
11416   }
11417 
11418   address generate_cont_thaw() {
11419     if (!Continuations::enabled()) return nullptr;
11420 
11421     StubId stub_id = StubId::stubgen_cont_thaw_id;
11422     int entry_count = StubInfo::entry_count(stub_id);
11423     assert(entry_count == 1, "sanity check");
11424     address start = load_archive_data(stub_id);
11425     if (start != nullptr) {
11426       return start;
11427     }
11428     StubCodeMark mark(this, stub_id);
11429     start = __ pc();
11430     generate_cont_thaw(Continuation::thaw_top);
11431 
11432     // record the stub start and end
11433     store_archive_data(stub_id, start, __ pc());
11434 
11435     return start;
11436   }
11437 
11438   address generate_cont_returnBarrier() {
11439     if (!Continuations::enabled()) return nullptr;
11440 
11441     // TODO: will probably need multiple return barriers depending on return type
11442     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
11443     int entry_count = StubInfo::entry_count(stub_id);
11444     assert(entry_count == 1, "sanity check");
11445     address start = load_archive_data(stub_id);
11446     if (start != nullptr) {
11447       return start;
11448     }
11449     StubCodeMark mark(this, stub_id);
11450     start = __ pc();
11451 
11452     generate_cont_thaw(Continuation::thaw_return_barrier);
11453 
11454     // record the stub start and end
11455     store_archive_data(stub_id, start, __ pc());
11456 
11457     return start;
11458   }
11459 
11460   address generate_cont_returnBarrier_exception() {
11461     if (!Continuations::enabled()) return nullptr;
11462 
11463     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
11464     int entry_count = StubInfo::entry_count(stub_id);
11465     assert(entry_count == 1, "sanity check");
11466     address start = load_archive_data(stub_id);
11467     if (start != nullptr) {
11468       return start;
11469     }
11470     StubCodeMark mark(this, stub_id);
11471     start = __ pc();
11472 
11473     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
11474 
11475     // record the stub start and end
11476     store_archive_data(stub_id, start, __ pc());
11477 
11478     return start;
11479   }
11480 
11481   address generate_cont_preempt_stub() {
11482     if (!Continuations::enabled()) return nullptr;
11483     StubId stub_id = StubId::stubgen_cont_preempt_id;
11484     int entry_count = StubInfo::entry_count(stub_id);
11485     assert(entry_count == 1, "sanity check");
11486     address start = load_archive_data(stub_id);
11487     if (start != nullptr) {
11488       return start;
11489     }
11490     StubCodeMark mark(this, stub_id);
11491     start = __ pc();
11492 
11493     __ reset_last_Java_frame(true);
11494 
11495     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
11496     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
11497     __ mov(sp, rscratch2);
11498 
11499     Label preemption_cancelled;
11500     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
11501     __ cbnz(rscratch1, preemption_cancelled);
11502 
11503     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
11504     SharedRuntime::continuation_enter_cleanup(_masm);
11505     __ leave();
11506     __ ret(lr);
11507 
11508     // We acquired the monitor after freezing the frames so call thaw to continue execution.
11509     __ bind(preemption_cancelled);
11510     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
11511     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
11512     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
11513     __ ldr(rscratch1, Address(rscratch1));
11514     __ br(rscratch1);
11515 
11516     // record the stub start and end
11517     store_archive_data(stub_id, start, __ pc());
11518 
11519     return start;
11520   }
11521 
11522   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
11523   // are represented as long[5], with BITS_PER_LIMB = 26.
11524   // Pack five 26-bit limbs into three 64-bit registers.
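  // Roughly, in C terms (with src viewed as uint64_t[5]):
  //   dest0 = src[0] | src[1] << 26 | src[2] << 52;        // bits 0..63
  //   dest1 = src[2] >> 12 | src[3] << 14 | src[4] << 40;  // bits 64..127
  //   dest2 = src[4] >> 24;                                // bits 128..129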
11525   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
11526     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
11527     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
11528     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
11529     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
11530 
11531     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
11532     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
11533     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
11534     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
11535 
11536     if (dest2->is_valid()) {
11537       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
11538     } else {
11539 #ifdef ASSERT
11540       Label OK;
11541       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
11542       __ br(__ EQ, OK);
11543       __ stop("high bits of Poly1305 integer should be zero");
11544       __ should_not_reach_here();
11545       __ bind(OK);
11546 #endif
11547     }
11548   }
11549 
11550   // As above, but return only a 128-bit integer, packed into two
11551   // 64-bit registers.
11552   void pack_26(Register dest0, Register dest1, Register src) {
11553     pack_26(dest0, dest1, noreg, src);
11554   }
11555 
11556   // Multiply and multiply-accumulate unsigned 64-bit registers.
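  // In effect: (prod_hi:prod_lo) = (unsigned __int128)n * m, and
  // wide_madd accumulates that product into the 128-bit pair
  // (sum_hi:sum_lo).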
11557   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
11558     __ mul(prod_lo, n, m);
11559     __ umulh(prod_hi, n, m);
11560   }
11561   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
11562     wide_mul(rscratch1, rscratch2, n, m);
11563     __ adds(sum_lo, sum_lo, rscratch1);
11564     __ adc(sum_hi, sum_hi, rscratch2);
11565   }
11566 
11567   // Poly1305, RFC 7539
11568 
11569   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
11570   // description of the tricks used to simplify and accelerate this
11571   // computation.
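  //
  // The main loop below computes, per 16-byte block of the message,
  //   U = ((U + block + 2^128) * R) mod (2^130 - 5)
  // where the 2^128 term is the pad bit appended to every full block,
  // and the partial reduction relies on 2^130 == 5 (mod 2^130 - 5) to
  // fold the bits above 2^130 back in as (high >> 2) * 5.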
11572 
11573   address generate_poly1305_processBlocks() {
11574     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
11575     int entry_count = StubInfo::entry_count(stub_id);
11576     assert(entry_count == 1, "sanity check");
11577     address start = load_archive_data(stub_id);
11578     if (start != nullptr) {
11579       return start;
11580     }
11581     __ align(CodeEntryAlignment);
11582     StubCodeMark mark(this, stub_id);
11583     start = __ pc();
11584     Label here;
11585     __ enter();
11586     RegSet callee_saved = RegSet::range(r19, r28);
11587     __ push(callee_saved, sp);
11588 
11589     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
11590 
11591     // Arguments
11592     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
11593 
11594     // R_n is the 128-bit randomly-generated key, packed into two
11595     // registers.  The caller passes this key to us as long[5], with
11596     // BITS_PER_LIMB = 26.
11597     const Register R_0 = *++regs, R_1 = *++regs;
11598     pack_26(R_0, R_1, r_start);
11599 
11600     // RR_n is (R_n >> 2) * 5
11601     const Register RR_0 = *++regs, RR_1 = *++regs;
    __ lsr(RR_0, R_0, 2);
    __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); // RR_0 = (R_0 >> 2) * 5
    __ lsr(RR_1, R_1, 2);
    __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); // RR_1 = (R_1 >> 2) * 5
11606 
11607     // U_n is the current checksum
11608     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
11609     pack_26(U_0, U_1, U_2, acc_start);
11610 
11611     static constexpr int BLOCK_LENGTH = 16;
11612     Label DONE, LOOP;
11613 
11614     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11615     __ br(Assembler::LT, DONE); {
11616       __ bind(LOOP);
11617 
11618       // S_n is to be the sum of U_n and the next block of data
11619       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
11620       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
11621       __ adds(S_0, U_0, S_0);
11622       __ adcs(S_1, U_1, S_1);
11623       __ adc(S_2, U_2, zr);
11624       __ add(S_2, S_2, 1);
11625 
11626       const Register U_0HI = *++regs, U_1HI = *++regs;
11627 
11628       // NB: this logic depends on some of the special properties of
11629       // Poly1305 keys. In particular, because we know that the top
11630       // four bits of R_0 and R_1 are zero, we can add together
11631       // partial products without any risk of needing to propagate a
11632       // carry out.
11633       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
11634       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
11635       __ andr(U_2, R_0, 3);
11636       __ mul(U_2, S_2, U_2);
11637 
11638       // Recycle registers S_0, S_1, S_2
11639       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
11640 
11641       // Partial reduction mod 2**130 - 5
11642       __ adds(U_1, U_0HI, U_1);
11643       __ adc(U_2, U_1HI, U_2);
11644       // Sum now in U_2:U_1:U_0.
11645       // Dead: U_0HI, U_1HI.
11646       regs = (regs.remaining() + U_0HI + U_1HI).begin();
11647 
11648       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
11649 
11650       // First, U_2:U_1:U_0 += (U_2 >> 2)
11651       __ lsr(rscratch1, U_2, 2);
11652       __ andr(U_2, U_2, (u8)3);
11653       __ adds(U_0, U_0, rscratch1);
11654       __ adcs(U_1, U_1, zr);
11655       __ adc(U_2, U_2, zr);
11656       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
11657       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
11658       __ adcs(U_1, U_1, zr);
11659       __ adc(U_2, U_2, zr);
11660 
11661       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
11662       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11663       __ br(~ Assembler::LT, LOOP);
11664     }
11665 
11666     // Further reduce modulo 2^130 - 5
11667     __ lsr(rscratch1, U_2, 2);
11668     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
11669     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
11670     __ adcs(U_1, U_1, zr);
11671     __ andr(U_2, U_2, (u1)3);
11672     __ adc(U_2, U_2, zr);
11673 
11674     // Unpack the sum into five 26-bit limbs and write to memory.
11675     __ ubfiz(rscratch1, U_0, 0, 26);
11676     __ ubfx(rscratch2, U_0, 26, 26);
11677     __ stp(rscratch1, rscratch2, Address(acc_start));
11678     __ ubfx(rscratch1, U_0, 52, 12);
11679     __ bfi(rscratch1, U_1, 12, 14);
11680     __ ubfx(rscratch2, U_1, 14, 26);
11681     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
11682     __ ubfx(rscratch1, U_1, 40, 24);
11683     __ bfi(rscratch1, U_2, 24, 3);
11684     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
11685 
11686     __ bind(DONE);
11687     __ pop(callee_saved, sp);
11688     __ leave();
11689     __ ret(lr);
11690 
11691     // record the stub start and end
11692     store_archive_data(stub_id, start, __ pc());
11693 
11694     return start;
11695   }
11696 
11697   // exception handler for upcall stubs
11698   address generate_upcall_stub_exception_handler() {
11699     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
11700     int entry_count = StubInfo::entry_count(stub_id);
11701     assert(entry_count == 1, "sanity check");
11702     address start = load_archive_data(stub_id);
11703     if (start != nullptr) {
11704       return start;
11705     }
11706     StubCodeMark mark(this, stub_id);
11707     start = __ pc();
11708 
    // The native caller has no idea how to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
11711     __ verify_oop(r0);
11712     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
11713     __ blr(rscratch1);
11714     __ should_not_reach_here();
11715 
11716     // record the stub start and end
11717     store_archive_data(stub_id, start, __ pc());
11718 
11719     return start;
11720   }
11721 
11722   // load Method* target of MethodHandle
11723   // j_rarg0 = jobject receiver
11724   // rmethod = result
11725   address generate_upcall_stub_load_target() {
11726     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
11727     int entry_count = StubInfo::entry_count(stub_id);
11728     assert(entry_count == 1, "sanity check");
11729     address start = load_archive_data(stub_id);
11730     if (start != nullptr) {
11731       return start;
11732     }
11733     StubCodeMark mark(this, stub_id);
11734     start = __ pc();
11735 
11736     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
11738     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
11739     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
11740     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
11741     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
11742                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
11743                       noreg, noreg);
11744     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
11745 
11746     __ ret(lr);
11747 
11748     // record the stub start and end
11749     store_archive_data(stub_id, start, __ pc());
11750 
11751     return start;
11752   }
11753 
11754 #undef __
11755 #define __ masm->
11756 
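  // Generator for the Montgomery multiply and square stubs. Operands
  // are processed as 64-bit digits, and the a*b partial products are
  // interleaved with the m*n reduction products into a shared
  // triple-precision accumulator (t2:t1:t0).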
11757   class MontgomeryMultiplyGenerator : public MacroAssembler {
11758 
11759     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
11760       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
11761 
11762     RegSet _toSave;
11763     bool _squaring;
11764 
11765   public:
11766     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
11767       : MacroAssembler(as->code()), _squaring(squaring) {
11768 
11769       // Register allocation
11770 
11771       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
11772       Pa_base = *regs;       // Argument registers
11773       if (squaring)
11774         Pb_base = Pa_base;
11775       else
11776         Pb_base = *++regs;
11777       Pn_base = *++regs;
      Rlen = *++regs;
11779       inv = *++regs;
11780       Pm_base = *++regs;
11781 
      // Working registers:
11783       Ra =  *++regs;        // The current digit of a, b, n, and m.
11784       Rb =  *++regs;
11785       Rm =  *++regs;
11786       Rn =  *++regs;
11787 
11788       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
11789       Pb =  *++regs;
11790       Pm =  *++regs;
11791       Pn =  *++regs;
11792 
11793       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
11795       t2 =  *++regs;
11796 
11797       Ri =  *++regs;        // Inner and outer loop indexes.
11798       Rj =  *++regs;
11799 
11800       Rhi_ab = *++regs;     // Product registers: low and high parts
11801       Rlo_ab = *++regs;     // of a*b and m*n.
11802       Rhi_mn = *++regs;
11803       Rlo_mn = *++regs;
11804 
11805       // r19 and up are callee-saved.
11806       _toSave = RegSet::range(r19, *regs) + Pm_base;
11807     }
11808 
11809   private:
11810     void save_regs() {
11811       push(_toSave, sp);
11812     }
11813 
11814     void restore_regs() {
11815       pop(_toSave, sp);
11816     }
11817 
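    // Invoke block exactly count times, unrolled two copies per loop
    // iteration; an odd count enters the loop at the second copy.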
11818     template <typename T>
11819     void unroll_2(Register count, T block) {
11820       Label loop, end, odd;
11821       tbnz(count, 0, odd);
11822       cbz(count, end);
11823       align(16);
11824       bind(loop);
11825       (this->*block)();
11826       bind(odd);
11827       (this->*block)();
11828       subs(count, count, 2);
11829       br(Assembler::GT, loop);
11830       bind(end);
11831     }
11832 
11833     template <typename T>
11834     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11835       Label loop, end, odd;
11836       tbnz(count, 0, odd);
11837       cbz(count, end);
11838       align(16);
11839       bind(loop);
11840       (this->*block)(d, s, tmp);
11841       bind(odd);
11842       (this->*block)(d, s, tmp);
11843       subs(count, count, 2);
11844       br(Assembler::GT, loop);
11845       bind(end);
11846     }
11847 
11848     void pre1(RegisterOrConstant i) {
11849       block_comment("pre1");
11850       // Pa = Pa_base;
11851       // Pb = Pb_base + i;
11852       // Pm = Pm_base;
11853       // Pn = Pn_base + i;
11854       // Ra = *Pa;
11855       // Rb = *Pb;
11856       // Rm = *Pm;
11857       // Rn = *Pn;
11858       ldr(Ra, Address(Pa_base));
11859       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11860       ldr(Rm, Address(Pm_base));
11861       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11862       lea(Pa, Address(Pa_base));
11863       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11864       lea(Pm, Address(Pm_base));
11865       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11866 
11867       // Zero the m*n result.
11868       mov(Rhi_mn, zr);
11869       mov(Rlo_mn, zr);
11870     }
11871 
11872     // The core multiply-accumulate step of a Montgomery
11873     // multiplication.  The idea is to schedule operations as a
11874     // pipeline so that instructions with long latencies (loads and
11875     // multiplies) have time to complete before their results are
11876     // used.  This most benefits in-order implementations of the
11877     // architecture but out-of-order ones also benefit.
11878     void step() {
11879       block_comment("step");
11880       // MACC(Ra, Rb, t0, t1, t2);
11881       // Ra = *++Pa;
11882       // Rb = *--Pb;
11883       umulh(Rhi_ab, Ra, Rb);
11884       mul(Rlo_ab, Ra, Rb);
11885       ldr(Ra, pre(Pa, wordSize));
11886       ldr(Rb, pre(Pb, -wordSize));
11887       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11888                                        // previous iteration.
11889       // MACC(Rm, Rn, t0, t1, t2);
11890       // Rm = *++Pm;
11891       // Rn = *--Pn;
11892       umulh(Rhi_mn, Rm, Rn);
11893       mul(Rlo_mn, Rm, Rn);
11894       ldr(Rm, pre(Pm, wordSize));
11895       ldr(Rn, pre(Pn, -wordSize));
11896       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11897     }
11898 
11899     void post1() {
11900       block_comment("post1");
11901 
11902       // MACC(Ra, Rb, t0, t1, t2);
11903       // Ra = *++Pa;
11904       // Rb = *--Pb;
11905       umulh(Rhi_ab, Ra, Rb);
11906       mul(Rlo_ab, Ra, Rb);
11907       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11908       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11909 
11910       // *Pm = Rm = t0 * inv;
11911       mul(Rm, t0, inv);
11912       str(Rm, Address(Pm));
11913 
11914       // MACC(Rm, Rn, t0, t1, t2);
11915       // t0 = t1; t1 = t2; t2 = 0;
11916       umulh(Rhi_mn, Rm, Rn);
11917 
11918 #ifndef PRODUCT
11919       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11920       {
11921         mul(Rlo_mn, Rm, Rn);
11922         add(Rlo_mn, t0, Rlo_mn);
11923         Label ok;
11924         cbz(Rlo_mn, ok); {
11925           stop("broken Montgomery multiply");
11926         } bind(ok);
11927       }
11928 #endif
11929       // We have very carefully set things up so that
11930       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11931       // the lower half of Rm * Rn because we know the result already:
11932       // it must be -t0.  t0 + (-t0) must generate a carry iff
11933       // t0 != 0.  So, rather than do a mul and an adds we just set
11934       // the carry flag iff t0 is nonzero.
11935       //
11936       // mul(Rlo_mn, Rm, Rn);
11937       // adds(zr, t0, Rlo_mn);
11938       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11939       adcs(t0, t1, Rhi_mn);
11940       adc(t1, t2, zr);
11941       mov(t2, zr);
11942     }
11943 
11944     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11945       block_comment("pre2");
11946       // Pa = Pa_base + i-len;
11947       // Pb = Pb_base + len;
11948       // Pm = Pm_base + i-len;
11949       // Pn = Pn_base + len;
11950 
11951       if (i.is_register()) {
11952         sub(Rj, i.as_register(), len);
11953       } else {
11954         mov(Rj, i.as_constant());
11955         sub(Rj, Rj, len);
11956       }
11957       // Rj == i-len
11958 
11959       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11960       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11961       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11962       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11963 
11964       // Ra = *++Pa;
11965       // Rb = *--Pb;
11966       // Rm = *++Pm;
11967       // Rn = *--Pn;
11968       ldr(Ra, pre(Pa, wordSize));
11969       ldr(Rb, pre(Pb, -wordSize));
11970       ldr(Rm, pre(Pm, wordSize));
11971       ldr(Rn, pre(Pn, -wordSize));
11972 
11973       mov(Rhi_mn, zr);
11974       mov(Rlo_mn, zr);
11975     }
11976 
11977     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11978       block_comment("post2");
11979       if (i.is_constant()) {
11980         mov(Rj, i.as_constant()-len.as_constant());
11981       } else {
11982         sub(Rj, i.as_register(), len);
11983       }
11984 
11985       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11986 
11987       // As soon as we know the least significant digit of our result,
11988       // store it.
11989       // Pm_base[i-len] = t0;
11990       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11991 
11992       // t0 = t1; t1 = t2; t2 = 0;
11993       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11994       adc(t1, t2, zr);
11995       mov(t2, zr);
11996     }
11997 
11998     // A carry in t0 after Montgomery multiplication means that we
11999     // should subtract multiples of n from our result in m.  We'll
12000     // keep doing that until there is no carry.
12001     void normalize(RegisterOrConstant len) {
12002       block_comment("normalize");
12003       // while (t0)
12004       //   t0 = sub(Pm_base, Pn_base, t0, len);
12005       Label loop, post, again;
12006       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
12007       cbz(t0, post); {
12008         bind(again); {
12009           mov(i, zr);
12010           mov(cnt, len);
12011           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
12012           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
12013           subs(zr, zr, zr); // set carry flag, i.e. no borrow
12014           align(16);
12015           bind(loop); {
12016             sbcs(Rm, Rm, Rn);
12017             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
12018             add(i, i, 1);
12019             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
12020             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
12021             sub(cnt, cnt, 1);
12022           } cbnz(cnt, loop);
12023           sbc(t0, t0, zr);
12024         } cbnz(t0, again);
12025       } bind(post);
12026     }
12027 
12028     // Move memory at s to d, reversing words.
12029     //    Increments d to end of copied memory
12030     //    Destroys tmp1, tmp2
12031     //    Preserves len
12032     //    Leaves s pointing to the address which was in d at start
12033     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
12034       assert(tmp1->encoding() < r19->encoding(), "register corruption");
12035       assert(tmp2->encoding() < r19->encoding(), "register corruption");
12036 
12037       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
12038       mov(tmp1, len);
12039       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
12040       sub(s, d, len, ext::uxtw, LogBytesPerWord);
12041     }
    // where reverse1 copies a single 64-bit word, swapping its
    // 32-bit halves:
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32); // swap the two 32-bit words
      str(tmp, post(d, wordSize));
    }
12048 
12049     void step_squaring() {
12050       // An extra ACC
12051       step();
12052       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12053     }
12054 
12055     void last_squaring(RegisterOrConstant i) {
12056       Label dont;
12057       // if ((i & 1) == 0) {
12058       tbnz(i.as_register(), 0, dont); {
12059         // MACC(Ra, Rb, t0, t1, t2);
12060         // Ra = *++Pa;
12061         // Rb = *--Pb;
12062         umulh(Rhi_ab, Ra, Rb);
12063         mul(Rlo_ab, Ra, Rb);
12064         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12065       } bind(dont);
12066     }
12067 
12068     void extra_step_squaring() {
12069       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
12070 
12071       // MACC(Rm, Rn, t0, t1, t2);
12072       // Rm = *++Pm;
12073       // Rn = *--Pn;
12074       umulh(Rhi_mn, Rm, Rn);
12075       mul(Rlo_mn, Rm, Rn);
12076       ldr(Rm, pre(Pm, wordSize));
12077       ldr(Rn, pre(Pn, -wordSize));
12078     }
12079 
12080     void post1_squaring() {
12081       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
12082 
12083       // *Pm = Rm = t0 * inv;
12084       mul(Rm, t0, inv);
12085       str(Rm, Address(Pm));
12086 
12087       // MACC(Rm, Rn, t0, t1, t2);
12088       // t0 = t1; t1 = t2; t2 = 0;
12089       umulh(Rhi_mn, Rm, Rn);
12090 
12091 #ifndef PRODUCT
12092       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
12093       {
12094         mul(Rlo_mn, Rm, Rn);
12095         add(Rlo_mn, t0, Rlo_mn);
12096         Label ok;
12097         cbz(Rlo_mn, ok); {
12098           stop("broken Montgomery multiply");
12099         } bind(ok);
12100       }
12101 #endif
12102       // We have very carefully set things up so that
12103       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
12104       // the lower half of Rm * Rn because we know the result already:
12105       // it must be -t0.  t0 + (-t0) must generate a carry iff
12106       // t0 != 0.  So, rather than do a mul and an adds we just set
12107       // the carry flag iff t0 is nonzero.
12108       //
12109       // mul(Rlo_mn, Rm, Rn);
12110       // adds(zr, t0, Rlo_mn);
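      //
      // (subs computes t0 - 1 and sets the carry flag when there is
      // no borrow, i.e. exactly when t0 >= 1, so C == 1 iff t0 != 0;
      // this is the same carry that t0 + (-t0) would produce.)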
12111       subs(zr, t0, 1); // Set carry iff t0 is nonzero
12112       adcs(t0, t1, Rhi_mn);
12113       adc(t1, t2, zr);
12114       mov(t2, zr);
12115     }
12116 
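    // Add the double-word value Rhi:Rlo into the triple-precision
    // accumulator t0:t1:t2.  In C, approximately:
    //   t0 += Rlo; t1 += Rhi + carry; t2 += carry;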
12117     void acc(Register Rhi, Register Rlo,
12118              Register t0, Register t1, Register t2) {
12119       adds(t0, t0, Rlo);
12120       adcs(t1, t1, Rhi);
12121       adc(t2, t2, zr);
12122     }
12123 
12124   public:
12125     /**
12126      * Fast Montgomery multiplication.  The derivation of the
12127      * algorithm is in A Cryptographic Library for the Motorola
12128      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
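     *
     * The result m is the Montgomery product a * b * R^-1 (mod n),
     * where R is the usual Montgomery radix, a power of two matching
     * the operand length.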
12129      *
12130      * Arguments:
12131      *
12132      * Inputs for multiplication:
12133      *   c_rarg0   - int array elements a
12134      *   c_rarg1   - int array elements b
12135      *   c_rarg2   - int array elements n (the modulus)
12136      *   c_rarg3   - int length
12137      *   c_rarg4   - int inv
12138      *   c_rarg5   - int array elements m (the result)
12139      *
12140      * Inputs for squaring:
12141      *   c_rarg0   - int array elements a
12142      *   c_rarg1   - int array elements n (the modulus)
12143      *   c_rarg2   - int length
12144      *   c_rarg3   - int inv
12145      *   c_rarg4   - int array elements m (the result)
12146      *
12147      */
12148     address generate_multiply() {
12149       Label argh, nothing;
12150 
12151       align(CodeEntryAlignment);
12152       address entry = pc();
12153 
12154       cbzw(Rlen, nothing);
12155 
12156       enter();
12157 
12158       // Make room.
12159       cmpw(Rlen, 512);
12160       br(Assembler::HI, argh);
12161       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12162       andr(sp, Ra, -2 * wordSize);
12163 
12164       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
12165 
12166       {
12167         // Copy input args, reversing as we go.  We use Ra as a
12168         // temporary variable.
12169         reverse(Ra, Pa_base, Rlen, t0, t1);
12170         if (!_squaring)
12171           reverse(Ra, Pb_base, Rlen, t0, t1);
12172         reverse(Ra, Pn_base, Rlen, t0, t1);
12173       }
12174 
12175       // Push all call-saved registers and also Pm_base which we'll need
12176       // at the end.
12177       save_regs();
12178 
12179 #ifndef PRODUCT
12180       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
12181       {
12182         ldr(Rn, Address(Pn_base, 0));
12183         mul(Rlo_mn, Rn, inv);
12184         subs(zr, Rlo_mn, -1);
12185         Label ok;
12186         br(EQ, ok); {
12187           stop("broken inverse in Montgomery multiply");
12188         } bind(ok);
12189       }
12190 #endif
12191 
12192       mov(Pm_base, Ra);
12193 
12194       mov(t0, zr);
12195       mov(t1, zr);
12196       mov(t2, zr);
12197 
12198       block_comment("for (int i = 0; i < len; i++) {");
12199       mov(Ri, zr); {
12200         Label loop, end;
12201         cmpw(Ri, Rlen);
12202         br(Assembler::GE, end);
12203 
12204         bind(loop);
12205         pre1(Ri);
12206 
12207         block_comment("  for (j = i; j; j--) {"); {
12208           movw(Rj, Ri);
12209           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12210         } block_comment("  } // j");
12211 
12212         post1();
12213         addw(Ri, Ri, 1);
12214         cmpw(Ri, Rlen);
12215         br(Assembler::LT, loop);
12216         bind(end);
12217         block_comment("} // i");
12218       }
12219 
12220       block_comment("for (int i = len; i < 2*len; i++) {");
12221       mov(Ri, Rlen); {
12222         Label loop, end;
12223         cmpw(Ri, Rlen, Assembler::LSL, 1);
12224         br(Assembler::GE, end);
12225 
12226         bind(loop);
12227         pre2(Ri, Rlen);
12228 
12229         block_comment("  for (j = len*2-i-1; j; j--) {"); {
12230           lslw(Rj, Rlen, 1);
12231           subw(Rj, Rj, Ri);
12232           subw(Rj, Rj, 1);
12233           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12234         } block_comment("  } // j");
12235 
12236         post2(Ri, Rlen);
12237         addw(Ri, Ri, 1);
12238         cmpw(Ri, Rlen, Assembler::LSL, 1);
12239         br(Assembler::LT, loop);
12240         bind(end);
12241       }
12242       block_comment("} // i");
12243 
12244       normalize(Rlen);
12245 
12246       mov(Ra, Pm_base);  // Save Pm_base in Ra
12247       restore_regs();  // Restore caller's Pm_base
12248 
12249       // Copy our result into caller's Pm_base
12250       reverse(Pm_base, Ra, Rlen, t0, t1);
12251 
12252       leave();
12253       bind(nothing);
12254       ret(lr);
12255 
12256       // handler for error case
12257       bind(argh);
12258       stop("MontgomeryMultiply total_allocation must be <= 8192");
12259 
12260       return entry;
12261     }
12262     // In C, approximately:
12263 
12264     // void
12265     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
12266     //                     julong Pn_base[], julong Pm_base[],
12267     //                     julong inv, int len) {
12268     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12269     //   julong *Pa, *Pb, *Pn, *Pm;
12270     //   julong Ra, Rb, Rn, Rm;
12271 
12272     //   int i;
12273 
12274     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12275 
12276     //   for (i = 0; i < len; i++) {
12277     //     int j;
12278 
12279     //     Pa = Pa_base;
12280     //     Pb = Pb_base + i;
12281     //     Pm = Pm_base;
12282     //     Pn = Pn_base + i;
12283 
12284     //     Ra = *Pa;
12285     //     Rb = *Pb;
12286     //     Rm = *Pm;
12287     //     Rn = *Pn;
12288 
12289     //     int iters = i;
12290     //     for (j = 0; iters--; j++) {
12291     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12292     //       MACC(Ra, Rb, t0, t1, t2);
12293     //       Ra = *++Pa;
12294     //       Rb = *--Pb;
12295     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12296     //       MACC(Rm, Rn, t0, t1, t2);
12297     //       Rm = *++Pm;
12298     //       Rn = *--Pn;
12299     //     }
12300 
12301     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
12302     //     MACC(Ra, Rb, t0, t1, t2);
12303     //     *Pm = Rm = t0 * inv;
12304     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12305     //     MACC(Rm, Rn, t0, t1, t2);
12306 
12307     //     assert(t0 == 0, "broken Montgomery multiply");
12308 
12309     //     t0 = t1; t1 = t2; t2 = 0;
12310     //   }
12311 
12312     //   for (i = len; i < 2*len; i++) {
12313     //     int j;
12314 
12315     //     Pa = Pa_base + i-len;
12316     //     Pb = Pb_base + len;
12317     //     Pm = Pm_base + i-len;
12318     //     Pn = Pn_base + len;
12319 
12320     //     Ra = *++Pa;
12321     //     Rb = *--Pb;
12322     //     Rm = *++Pm;
12323     //     Rn = *--Pn;
12324 
12325     //     int iters = len*2-i-1;
12326     //     for (j = i-len+1; iters--; j++) {
12327     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12328     //       MACC(Ra, Rb, t0, t1, t2);
12329     //       Ra = *++Pa;
12330     //       Rb = *--Pb;
12331     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12332     //       MACC(Rm, Rn, t0, t1, t2);
12333     //       Rm = *++Pm;
12334     //       Rn = *--Pn;
12335     //     }
12336 
12337     //     Pm_base[i-len] = t0;
12338     //     t0 = t1; t1 = t2; t2 = 0;
12339     //   }
12340 
12341     //   while (t0)
12342     //     t0 = sub(Pm_base, Pn_base, t0, len);
12343     // }
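
    // where MACC is assumed to multiply-accumulate into the
    // triple-precision accumulator t0:t1:t2 (a sketch, not the exact
    // HotSpot macro):
    //
    // static void MACC(julong a, julong b, julong &t0, julong &t1, julong &t2) {
    //   unsigned __int128 p = (unsigned __int128)a * b;
    //   unsigned __int128 s = (unsigned __int128)t0 + (julong)p;
    //   t0 = (julong)s;
    //   s = (s >> 64) + (julong)(p >> 64) + t1;
    //   t1 = (julong)s;
    //   t2 += (julong)(s >> 64);
    // }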
12344 
12345     /**
12346      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
12347      * multiplies than Montgomery multiplication so it should be up to
12348      * 25% faster.  However, its loop control is more complex and it
12349      * may actually run slower on some machines.
12350      *
12351      * Arguments:
12352      *
12353      * Inputs:
12354      *   c_rarg0   - int array elements a
12355      *   c_rarg1   - int array elements n (the modulus)
12356      *   c_rarg2   - int length
12357      *   c_rarg3   - int inv
12358      *   c_rarg4   - int array elements m (the result)
12359      *
12360      */
12361     address generate_square() {
12362       Label argh;
12363 
12364       align(CodeEntryAlignment);
12365       address entry = pc();
12366 
12367       enter();
12368 
12369       // Make room.
12370       cmpw(Rlen, 512);
12371       br(Assembler::HI, argh);
12372       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12373       andr(sp, Ra, -2 * wordSize);
12374 
12375       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
12376 
12377       {
12378         // Copy input args, reversing as we go.  We use Ra as a
12379         // temporary variable.
12380         reverse(Ra, Pa_base, Rlen, t0, t1);
12381         reverse(Ra, Pn_base, Rlen, t0, t1);
12382       }
12383 
12384       // Push all call-saved registers and also Pm_base which we'll need
12385       // at the end.
12386       save_regs();
12387 
12388       mov(Pm_base, Ra);
12389 
12390       mov(t0, zr);
12391       mov(t1, zr);
12392       mov(t2, zr);
12393 
12394       block_comment("for (int i = 0; i < len; i++) {");
12395       mov(Ri, zr); {
12396         Label loop, end;
12397         bind(loop);
12398         cmp(Ri, Rlen);
12399         br(Assembler::GE, end);
12400 
12401         pre1(Ri);
12402 
12403         block_comment("for (j = (i+1)/2; j; j--) {"); {
12404           add(Rj, Ri, 1);
12405           lsr(Rj, Rj, 1);
12406           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12407         } block_comment("  } // j");
12408 
12409         last_squaring(Ri);
12410 
12411         block_comment("  for (j = i/2; j; j--) {"); {
12412           lsr(Rj, Ri, 1);
12413           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12414         } block_comment("  } // j");
12415 
12416         post1_squaring();
12417         add(Ri, Ri, 1);
12418         cmp(Ri, Rlen);
12419         br(Assembler::LT, loop);
12420 
12421         bind(end);
12422         block_comment("} // i");
12423       }
12424 
12425       block_comment("for (int i = len; i < 2*len; i++) {");
12426       mov(Ri, Rlen); {
12427         Label loop, end;
12428         bind(loop);
12429         cmp(Ri, Rlen, Assembler::LSL, 1);
12430         br(Assembler::GE, end);
12431 
12432         pre2(Ri, Rlen);
12433 
12434         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
12435           lsl(Rj, Rlen, 1);
12436           sub(Rj, Rj, Ri);
12437           sub(Rj, Rj, 1);
12438           lsr(Rj, Rj, 1);
12439           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12440         } block_comment("  } // j");
12441 
12442         last_squaring(Ri);
12443 
12444         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
12445           lsl(Rj, Rlen, 1);
12446           sub(Rj, Rj, Ri);
12447           lsr(Rj, Rj, 1);
12448           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12449         } block_comment("  } // j");
12450 
12451         post2(Ri, Rlen);
12452         add(Ri, Ri, 1);
12453         cmp(Ri, Rlen, Assembler::LSL, 1);
12454 
12455         br(Assembler::LT, loop);
12456         bind(end);
12457         block_comment("} // i");
12458       }
12459 
12460       normalize(Rlen);
12461 
12462       mov(Ra, Pm_base);  // Save Pm_base in Ra
12463       restore_regs();  // Restore caller's Pm_base
12464 
12465       // Copy our result into caller's Pm_base
12466       reverse(Pm_base, Ra, Rlen, t0, t1);
12467 
12468       leave();
12469       ret(lr);
12470 
12471       // handler for error case
12472       bind(argh);
12473       stop("MontgomeryMultiply total_allocation must be <= 8192");
12474 
12475       return entry;
12476     }
12477     // In C, approximately:
12478 
12479     // void
12480     // montgomery_square(julong Pa_base[], julong Pn_base[],
12481     //                   julong Pm_base[], julong inv, int len) {
12482     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12483     //   julong *Pa, *Pb, *Pn, *Pm;
12484     //   julong Ra, Rb, Rn, Rm;
12485 
12486     //   int i;
12487 
12488     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12489 
12490     //   for (i = 0; i < len; i++) {
12491     //     int j;
12492 
12493     //     Pa = Pa_base;
12494     //     Pb = Pa_base + i;
12495     //     Pm = Pm_base;
12496     //     Pn = Pn_base + i;
12497 
12498     //     Ra = *Pa;
12499     //     Rb = *Pb;
12500     //     Rm = *Pm;
12501     //     Rn = *Pn;
12502 
12503     //     int iters = (i+1)/2;
12504     //     for (j = 0; iters--; j++) {
12505     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12506     //       MACC2(Ra, Rb, t0, t1, t2);
12507     //       Ra = *++Pa;
12508     //       Rb = *--Pb;
12509     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12510     //       MACC(Rm, Rn, t0, t1, t2);
12511     //       Rm = *++Pm;
12512     //       Rn = *--Pn;
12513     //     }
12514     //     if ((i & 1) == 0) {
12515     //       assert(Ra == Pa_base[j], "must be");
12516     //       MACC(Ra, Ra, t0, t1, t2);
12517     //     }
12518     //     iters = i/2;
12519     //     assert(iters == i-j, "must be");
12520     //     for (; iters--; j++) {
12521     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12522     //       MACC(Rm, Rn, t0, t1, t2);
12523     //       Rm = *++Pm;
12524     //       Rn = *--Pn;
12525     //     }
12526 
12527     //     *Pm = Rm = t0 * inv;
12528     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12529     //     MACC(Rm, Rn, t0, t1, t2);
12530 
12531     //     assert(t0 == 0, "broken Montgomery multiply");
12532 
12533     //     t0 = t1; t1 = t2; t2 = 0;
12534     //   }
12535 
12536     //   for (i = len; i < 2*len; i++) {
12537     //     int start = i-len+1;
12538     //     int end = start + (len - start)/2;
12539     //     int j;
12540 
12541     //     Pa = Pa_base + i-len;
12542     //     Pb = Pa_base + len;
12543     //     Pm = Pm_base + i-len;
12544     //     Pn = Pn_base + len;
12545 
12546     //     Ra = *++Pa;
12547     //     Rb = *--Pb;
12548     //     Rm = *++Pm;
12549     //     Rn = *--Pn;
12550 
12551     //     int iters = (2*len-i-1)/2;
12552     //     assert(iters == end-start, "must be");
12553     //     for (j = start; iters--; j++) {
12554     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12555     //       MACC2(Ra, Rb, t0, t1, t2);
12556     //       Ra = *++Pa;
12557     //       Rb = *--Pb;
12558     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12559     //       MACC(Rm, Rn, t0, t1, t2);
12560     //       Rm = *++Pm;
12561     //       Rn = *--Pn;
12562     //     }
12563     //     if ((i & 1) == 0) {
12564     //       assert(Ra == Pa_base[j], "must be");
12565     //       MACC(Ra, Ra, t0, t1, t2);
12566     //     }
    //     iters = (2*len-i)/2;
12568     //     assert(iters == len-j, "must be");
12569     //     for (; iters--; j++) {
12570     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12571     //       MACC(Rm, Rn, t0, t1, t2);
12572     //       Rm = *++Pm;
12573     //       Rn = *--Pn;
12574     //     }
12575     //     Pm_base[i-len] = t0;
12576     //     t0 = t1; t1 = t2; t2 = 0;
12577     //   }
12578 
12579     //   while (t0)
12580     //     t0 = sub(Pm_base, Pn_base, t0, len);
12581     // }
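
    // where MACC2 is assumed to accumulate the product twice, i.e.
    // t0:t1:t2 += 2 * a * b, since cross terms count double when
    // squaring (a sketch, in terms of the MACC above):
    //
    // static void MACC2(julong a, julong b, julong &t0, julong &t1, julong &t2) {
    //   MACC(a, b, t0, t1, t2);
    //   MACC(a, b, t0, t1, t2);
    // }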
12582   };
12583 
  // Called from the interpreter or compiled code, either to load the
  // multiple returned values of an inline type instance into
  // registers, or to store the returned values into a newly allocated
  // inline type instance.
12588   address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers the calling convention may use so
    // that the runtime call can read or update those registers. This
    // needs to be in sync with SharedRuntime::java_return_convention().
12592     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
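    // Each 64-bit register is described by a pair of 32-bit VMReg
    // stack slots, hence the *_off / *_2 pairs below.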
12593     enum layout {
12594       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
12595       j_rarg6_off, j_rarg6_2,
12596       j_rarg5_off, j_rarg5_2,
12597       j_rarg4_off, j_rarg4_2,
12598       j_rarg3_off, j_rarg3_2,
12599       j_rarg2_off, j_rarg2_2,
12600       j_rarg1_off, j_rarg1_2,
12601       j_rarg0_off, j_rarg0_2,
12602 
12603       j_farg7_off, j_farg7_2,
12604       j_farg6_off, j_farg6_2,
12605       j_farg5_off, j_farg5_2,
12606       j_farg4_off, j_farg4_2,
12607       j_farg3_off, j_farg3_2,
12608       j_farg2_off, j_farg2_2,
12609       j_farg1_off, j_farg1_2,
12610       j_farg0_off, j_farg0_2,
12611 
12612       rfp_off, rfp_off2,
12613       return_off, return_off2,
12614 
12615       framesize // inclusive of return address
12616     };
12617 
12618     CodeBuffer code(name, 512, 64);
12619     MacroAssembler* masm = new MacroAssembler(&code);
12620 
12621     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
12622     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
12623     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
12624     int frame_size_in_words = frame_size_in_bytes / wordSize;
12625 
12626     OopMapSet* oop_maps = new OopMapSet();
12627     OopMap* map = new OopMap(frame_size_in_slots, 0);
12628 
12629     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
12630     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
12631     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
12632     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
12633     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
12634     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
12635     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
12636     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
12637 
12638     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
12639     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
12640     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
12641     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
12642     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
12643     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
12644     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
12645     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
12646 
12647     address start = __ pc();
12648 
12649     __ enter(); // Save FP and LR before call
12650 
12651     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
12652     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
12653     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
12654     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
12655 
12656     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
12657     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
12658     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
12659     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
12660 
12661     int frame_complete = __ offset();
12662 
12663     // Set up last_Java_sp and last_Java_fp
12664     address the_pc = __ pc();
12665     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
12666 
12667     // Call runtime
12668     __ mov(c_rarg1, r0);
12669     __ mov(c_rarg0, rthread);
12670 
12671     __ mov(rscratch1, destination);
12672     __ blr(rscratch1);
12673 
12674     oop_maps->add_gc_map(the_pc - start, map);
12675 
12676     __ reset_last_Java_frame(false);
12677 
12678     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
12679     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
12680     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
12681     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
12682 
12683     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
12684     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
12685     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
12686     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
12687 
12688     __ leave();
12689 
12690     // check for pending exceptions
12691     Label pending;
12692     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
12693     __ cbnz(rscratch1, pending);
12694 
12695     if (has_res) {
12696       __ get_vm_result_oop(r0, rthread);
12697     }
12698 
12699     __ ret(lr);
12700 
12701     __ bind(pending);
12702     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
12703 
12704     // -------------
12705     // make sure all code is generated
12706     masm->flush();
12707 
12708     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
12709     return stub->entry_point();
12710   }
12711 
12712   // Initialization
12713   void generate_preuniverse_stubs() {
12714     // preuniverse stubs are not needed for aarch64
12715   }
12716 
12717   void generate_initial_stubs() {
    // Generates initial stubs and initializes the entry points.
12719 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
12725 
12726     StubRoutines::_forward_exception_entry = generate_forward_exception();
12727 
12728     StubRoutines::_call_stub_entry =
12729       generate_call_stub(StubRoutines::_call_stub_return_address);
12730 
12731     // is referenced by megamorphic call
12732     StubRoutines::_catch_exception_entry = generate_catch_exception();
12733 
12734     // Initialize table for copy memory (arraycopy) check.
12735     if (UnsafeMemoryAccess::_table == nullptr) {
12736       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
12737     }
12738 
12739     if (UseCRC32Intrinsics) {
12740       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
12741     }
12742 
12743     if (UseCRC32CIntrinsics) {
12744       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
12745     }
12746 
12747     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
12748       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
12749     }
12750 
12751     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
12752       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
12753     }
12754 
12755     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
12756         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
12757       StubRoutines::_hf2f = generate_float16ToFloat();
12758       StubRoutines::_f2hf = generate_floatToFloat16();
12759     }
12760 
12761     if (InlineTypeReturnedAsFields) {
12762       StubRoutines::_load_inline_type_fields_in_regs =
12763          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
12764       StubRoutines::_store_inline_type_fields_to_buf =
12765          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
12766     }
12767 
12768   }
12769 
12770   void generate_continuation_stubs() {
12771     // Continuation stubs:
12772     StubRoutines::_cont_thaw          = generate_cont_thaw();
12773     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
12774     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
12775     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
12776   }
12777 
12778   void generate_final_stubs() {
12779     // support for verify_oop (must happen after universe_init)
12780     if (VerifyOops) {
12781       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
12782     }
12783 
12784     // arraycopy stubs used by compilers
12785     generate_arraycopy_stubs();
12786 
12787     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
12788 
12789     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
12790 
12791     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
12792     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
12793 
12794 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12795 
12796     generate_atomic_entry_points();
12797 
12798 #endif // LINUX
12799 
12800 #ifdef COMPILER2
12801     if (UseSecondarySupersTable) {
12802       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
12804         generate_lookup_secondary_supers_table_stub();
12805       }
12806     }
12807 #endif
12808 
12809     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
12810       StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
12811     }
12812 
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
12814   }
12815 
12816   void generate_compiler_stubs() {
12817 #if COMPILER2_OR_JVMCI
12818 
12819     if (UseSVE == 0) {
12820       generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
12821     }
12822 
12823     // array equals stub for large arrays.
12824     if (!UseSimpleArrayEquals) {
12825       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
12826     }
12827 
    // arrays_hashcode stub for large arrays.
12829     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
12830     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
12831     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
12832     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
12833     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
12834 
12835     // byte_array_inflate stub for large arrays.
12836     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
12837 
12838     // countPositives stub for large arrays.
12839     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
12840 
12841     generate_compare_long_strings();
12842 
12843     generate_string_indexof_stubs();
12844 
12845 #ifdef COMPILER2
12846     if (UseMultiplyToLenIntrinsic) {
12847       StubRoutines::_multiplyToLen = generate_multiplyToLen();
12848     }
12849 
12850     if (UseSquareToLenIntrinsic) {
12851       StubRoutines::_squareToLen = generate_squareToLen();
12852     }
12853 
12854     if (UseMulAddIntrinsic) {
12855       StubRoutines::_mulAdd = generate_mulAdd();
12856     }
12857 
12858     if (UseSIMDForBigIntegerShiftIntrinsics) {
12859       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
12860       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
12861     }
12862 
12863     if (UseMontgomeryMultiplyIntrinsic) {
12864       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
12865       address start = load_archive_data(stub_id);
12866       if (start == nullptr) {
12867         // we have to generate it
12868         StubCodeMark mark(this, stub_id);
12869         MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
12870         start = g.generate_multiply();
12871         // record the stub start and end
12872         store_archive_data(stub_id, start, _masm->pc());
12873       }
12874       StubRoutines::_montgomeryMultiply = start;
12875     }
12876 
12877     if (UseMontgomerySquareIntrinsic) {
12878       StubId stub_id = StubId::stubgen_montgomerySquare_id;
12879       address start = load_archive_data(stub_id);
12880       if (start == nullptr) {
12881         // we have to generate it
12882         StubCodeMark mark(this, stub_id);
12883         MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
12884         // We use generate_multiply() rather than generate_square()
12885         // because it's faster for the sizes of modulus we care about.
12886         start = g.generate_multiply();
12887         // record the stub start and end
12888         store_archive_data(stub_id, start, _masm->pc());
12889       }
12890       StubRoutines::_montgomerySquare = start;
12891     }
12892 
12893 #endif // COMPILER2
12894 
12895     if (UseChaCha20Intrinsics) {
12896       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
12897     }
12898 
12899     if (UseKyberIntrinsics) {
12900       StubRoutines::_kyberNtt = generate_kyberNtt();
12901       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12902       StubRoutines::_kyberNttMult = generate_kyberNttMult();
12903       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12904       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12905       StubRoutines::_kyber12To16 = generate_kyber12To16();
12906       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12907     }
12908 
12909     if (UseDilithiumIntrinsics) {
12910       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12911       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12912       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12913       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12914       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12915     }
12916 
12917     if (UseBASE64Intrinsics) {
12918         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12919         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12920     }
12921 
12922     // data cache line writeback
12923     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12924     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12925 
12926     if (UseAESIntrinsics) {
12927       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12928       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12929       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12930       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12931       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12932     }
12933     if (UseGHASHIntrinsics) {
12935       StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
12936       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
12937     }
12938     if (UseAESIntrinsics && UseGHASHIntrinsics) {
12939       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12940     }
12941 
12942     if (UseMD5Intrinsics) {
12943       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12944       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12945     }
12946     if (UseSHA1Intrinsics) {
12947       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12948       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12949     }
12950     if (UseSHA256Intrinsics) {
12951       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12952       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12953     }
12954     if (UseSHA512Intrinsics) {
12955       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12956       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12957     }
12958     if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {
12959       StubRoutines::_double_keccak         = generate_double_keccak();
12960       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12961       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12962     } else if (UseSHA3Intrinsics) {
12963       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12964       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12965     }
12966 
12967     if (UsePoly1305Intrinsics) {
12968       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12969     }
12970 
12971     // generate Adler32 intrinsics code
12972     if (UseAdler32Intrinsics) {
12973       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12974     }
12975 
12976 #endif // COMPILER2_OR_JVMCI
12977   }
12978 
12979  public:
12980   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
12981     switch(blob_id) {
12982     case BlobId::stubgen_preuniverse_id:
12983       generate_preuniverse_stubs();
12984       break;
12985     case BlobId::stubgen_initial_id:
12986       generate_initial_stubs();
12987       break;
    case BlobId::stubgen_continuation_id:
12989       generate_continuation_stubs();
12990       break;
12991     case BlobId::stubgen_compiler_id:
12992       generate_compiler_stubs();
12993       break;
12994     case BlobId::stubgen_final_id:
12995       generate_final_stubs();
12996       break;
12997     default:
12998       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12999       break;
13000     };
13001   }
13002 
13003 #if INCLUDE_CDS
13004   static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
13005     // external data defined in this file
13006 #define ADD(addr) external_addresses.append((address)(addr));
13007     ADD(_sha256_round_consts);
13008     ADD(_sha512_round_consts);
13009     ADD(_sha3_round_consts);
13010     ADD(_double_keccak_round_consts);
13011     ADD(_encodeBlock_toBase64);
13012     ADD(_encodeBlock_toBase64URL);
13013     ADD(_decodeBlock_fromBase64ForNoSIMD);
13014     ADD(_decodeBlock_fromBase64URLForNoSIMD);
13015     ADD(_decodeBlock_fromBase64ForSIMD);
13016     ADD(_decodeBlock_fromBase64URLForSIMD);
13017 #undef ADD
13018   }
13019 #endif // INCLUDE_CDS
13020 }; // end class declaration
13021 
13022 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
13023   StubGenerator g(code, blob_id, stub_data);
13024 }
13025 
13026 #if INCLUDE_CDS
13027 void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
13028   StubGenerator::init_AOTAddressTable(addresses);
13029 }
13030 #endif // INCLUDE_CDS
13031 
13032 #if defined (LINUX)
13033 
13034 // Define pointers to atomic stubs and initialize them to point to the
13035 // code in atomic_aarch64.S.
13036 
13037 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
13038   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
13039     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
13040   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
13041     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
13042 
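// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
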
13043 DEFAULT_ATOMIC_OP(fetch_add, 4, )
13044 DEFAULT_ATOMIC_OP(fetch_add, 8, )
13045 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
13046 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
13047 DEFAULT_ATOMIC_OP(xchg, 4, )
13048 DEFAULT_ATOMIC_OP(xchg, 8, )
13049 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
13050 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
13051 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
13052 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
13053 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
13054 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
13055 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
13056 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
13057 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
13058 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
13059 
13060 #undef DEFAULT_ATOMIC_OP
13061 
13062 #endif // LINUX