1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/objArrayKlass.hpp"
  30 #include "runtime/sharedRuntime.hpp"
  31 #include "runtime/stubRoutines.hpp"
  32 #include "stubGenerator_x86_64.hpp"
  33 #ifdef COMPILER2
  34 #include "opto/c2_globals.hpp"
  35 #endif
  36 #if INCLUDE_JVMCI
  37 #include "jvmci/jvmci_globals.hpp"
  38 #endif
  39 
  40 #define __ _masm->
  41 
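// Address scale factor for oop array elements: a narrow oop is 4 bytes wide
// when compressed oops are in use, otherwise an oop is a full 8 byte pointer.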
  42 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #else
  47 #define BLOCK_COMMENT(str) __ block_comment(str)
  48 #endif // PRODUCT
  49 
  50 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  51 
  52 #ifdef PRODUCT
  53 #define INC_COUNTER_NP(counter, rscratch) ((void)0)
  54 #else
  55 #define INC_COUNTER_NP(counter, rscratch) \
  56 BLOCK_COMMENT("inc_counter " #counter); \
  57 inc_counter_np(_masm, counter, rscratch);
  58 
  59 static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  60   __ incrementl(ExternalAddress((address)&counter), rscratch);
  61 }
  62 
  63 #if COMPILER2_OR_JVMCI
  64 static uint& get_profile_ctr(int shift) {
  65   if (shift == 0) {
  66     return SharedRuntime::_jbyte_array_copy_ctr;
  67   } else if (shift == 1) {
  68     return SharedRuntime::_jshort_array_copy_ctr;
  69   } else if (shift == 2) {
  70     return SharedRuntime::_jint_array_copy_ctr;
  71   } else {
  72     assert(shift == 3, "");
  73     return SharedRuntime::_jlong_array_copy_ctr;
  74   }
  75 }
  76 #endif // COMPILER2_OR_JVMCI
  77 #endif // !PRODUCT
  78 
  79 void StubGenerator::generate_arraycopy_stubs() {
  80   address entry;
  81   address entry_jbyte_arraycopy;
  82   address entry_jshort_arraycopy;
  83   address entry_jint_arraycopy;
  84   address entry_oop_arraycopy;
  85   address entry_jlong_arraycopy;
  86   address entry_checkcast_arraycopy;
  87 
  88   StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
  89                                                                          "jbyte_disjoint_arraycopy");
  90   StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
  91                                                                          "jbyte_arraycopy");
  92 
  93   StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
  94                                                                           "jshort_disjoint_arraycopy");
  95   StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
  96                                                                           "jshort_arraycopy");
  97 
  98   StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
  99                                                                             "jint_disjoint_arraycopy");
 100   StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
 101                                                                             &entry_jint_arraycopy, "jint_arraycopy");
 102 
 103   StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
 104                                                                              "jlong_disjoint_arraycopy");
 105   StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
 106                                                                              &entry_jlong_arraycopy, "jlong_arraycopy");
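  // Oop copies reuse the int copy stubs when compressed oops are in use
  // (an element is a 4 byte narrowOop), and the long copy stubs otherwise.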
 107   if (UseCompressedOops) {
 108     StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
 109                                                                             "oop_disjoint_arraycopy");
 110     StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
 111                                                                             &entry_oop_arraycopy, "oop_arraycopy");
 112     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
 113                                                                                    "oop_disjoint_arraycopy_uninit",
 114                                                                                    /*dest_uninitialized*/true);
 115     StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
 116                                                                                    nullptr, "oop_arraycopy_uninit",
 117                                                                                    /*dest_uninitialized*/true);
 118   } else {
 119     StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
 120                                                                              "oop_disjoint_arraycopy");
 121     StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
 122                                                                              &entry_oop_arraycopy, "oop_arraycopy");
 123     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
 124                                                                                     "oop_disjoint_arraycopy_uninit",
 125                                                                                     /*dest_uninitialized*/true);
 126     StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
 127                                                                                     nullptr, "oop_arraycopy_uninit",
 128                                                                                     /*dest_uninitialized*/true);
 129   }
 130 
 131   StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
 132   StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
 133                                                                       /*dest_uninitialized*/true);
 134 
 135   StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
 136                                                             entry_jbyte_arraycopy,
 137                                                             entry_jshort_arraycopy,
 138                                                             entry_jint_arraycopy,
 139                                                             entry_jlong_arraycopy);
 140   StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
 141                                                              entry_jbyte_arraycopy,
 142                                                              entry_jshort_arraycopy,
 143                                                              entry_jint_arraycopy,
 144                                                              entry_oop_arraycopy,
 145                                                              entry_jlong_arraycopy,
 146                                                              entry_checkcast_arraycopy);
 147 
 148   StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
 149   StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
 150   StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
 151   StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
 152   StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
 153   StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
 154 
 155   // We don't generate specialized code for HeapWord-aligned source
 156   // arrays, so just use the code we've already generated
 157   StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
 158   StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
 159 
 160   StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
 161   StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
 162 
 163   StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
 164   StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
 165 
 166   StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
 167   StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
 168 
 169   StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
 170   StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
 171 
 172   StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
 173   StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
 174 }
 175 
 176 
// Verify that a register contains a clean 32-bit positive value
// (high 32 bits are 0) so it can be used in 64-bit shifts.
 179 //
 180 //  Input:
//    Rint  -  32-bit value
 182 //    Rtmp  -  scratch
 183 //
 184 void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
 185 #ifdef ASSERT
 186   Label L;
 187   assert_different_registers(Rtmp, Rint);
 188   __ movslq(Rtmp, Rint);
 189   __ cmpq(Rtmp, Rint);
 190   __ jcc(Assembler::equal, L);
 191   __ stop("high 32-bits of int value are not 0");
 192   __ bind(L);
 193 #endif
 194 }
 195 
 196 
 197 //  Generate overlap test for array copy stubs
 198 //
 199 //  Input:
 200 //     c_rarg0 - from
 201 //     c_rarg1 - to
 202 //     c_rarg2 - element count
 203 //
 204 //  Output:
//     rax   - &from[element count]
 206 //
 207 void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
 208   const Register from     = c_rarg0;
 209   const Register to       = c_rarg1;
 210   const Register count    = c_rarg2;
 211   const Register end_from = rax;
 212 
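  // A forward (disjoint-style) copy is safe when the destination starts at or
  // below the source (to <= from), or at or beyond the end of the source
  // (to >= from + count), i.e. the regions do not overlap in the copy direction.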
 213   __ cmpptr(to, from);
 214   __ lea(end_from, Address(from, count, sf, 0));
 215   if (NOLp == nullptr) {
 216     ExternalAddress no_overlap(no_overlap_target);
 217     __ jump_cc(Assembler::belowEqual, no_overlap);
 218     __ cmpptr(to, end_from);
 219     __ jump_cc(Assembler::aboveEqual, no_overlap);
 220   } else {
 221     __ jcc(Assembler::belowEqual, (*NOLp));
 222     __ cmpptr(to, end_from);
 223     __ jcc(Assembler::aboveEqual, (*NOLp));
 224   }
 225 }
 226 
 227 
 228 // Copy big chunks forward
 229 //
// Inputs:
//   end_from     - source array end address
//   end_to       - destination array end address
//   qword_count  - 64-bit element count, negative
//   tmp1, tmp2   - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes - exit label
 237 //
 238 void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
 239                                        Register qword_count, Register tmp1,
 240                                        Register tmp2, Label& L_copy_bytes,
 241                                        Label& L_copy_8_bytes, DecoratorSet decorators,
 242                                        BasicType type) {
 243   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 244   DEBUG_ONLY(__ stop("enter at entry label, not here"));
 245   Label L_loop;
 246   __ align(OptoLoopAlignment);
 247   if (UseUnalignedLoadStores) {
 248     Label L_end;
 249     __ BIND(L_loop);
 250     if (UseAVX >= 2) {
 251       bs->copy_load_at(_masm, decorators, type, 32,
 252                        xmm0, Address(end_from, qword_count, Address::times_8, -56),
 253                        tmp1, xmm1);
 254       bs->copy_store_at(_masm, decorators, type, 32,
 255                         Address(end_to, qword_count, Address::times_8, -56), xmm0,
 256                         tmp1, tmp2, xmm1);
 257 
 258       bs->copy_load_at(_masm, decorators, type, 32,
 259                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 260                        tmp1, xmm1);
 261       bs->copy_store_at(_masm, decorators, type, 32,
 262                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 263                         tmp1, tmp2, xmm1);
 264     } else {
 265       bs->copy_load_at(_masm, decorators, type, 16,
 266                        xmm0, Address(end_from, qword_count, Address::times_8, -56),
 267                        tmp1, xmm1);
 268       bs->copy_store_at(_masm, decorators, type, 16,
 269                         Address(end_to, qword_count, Address::times_8, -56), xmm0,
 270                         tmp1, tmp2, xmm1);
 271       bs->copy_load_at(_masm, decorators, type, 16,
 272                        xmm0, Address(end_from, qword_count, Address::times_8, -40),
 273                        tmp1, xmm1);
 274       bs->copy_store_at(_masm, decorators, type, 16,
 275                         Address(end_to, qword_count, Address::times_8, -40), xmm0,
 276                         tmp1, tmp2, xmm1);
 277       bs->copy_load_at(_masm, decorators, type, 16,
 278                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 279                        tmp1, xmm1);
 280       bs->copy_store_at(_masm, decorators, type, 16,
 281                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 282                         tmp1, tmp2, xmm1);
 283       bs->copy_load_at(_masm, decorators, type, 16,
 284                        xmm0, Address(end_from, qword_count, Address::times_8, -8),
 285                        tmp1, xmm1);
 286       bs->copy_store_at(_masm, decorators, type, 16,
 287                         Address(end_to, qword_count, Address::times_8, -8), xmm0,
 288                         tmp1, tmp2, xmm1);
 289     }
 290 
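    // qword_count is negative and counts up toward zero; each iteration above
    // copies 8 qwords (64 bytes). The adjustment below then checks whether a
    // trailing block of at least 4 qwords (32 bytes) remains.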
 291     __ BIND(L_copy_bytes);
 292     __ addptr(qword_count, 8);
 293     __ jcc(Assembler::lessEqual, L_loop);
 294     __ subptr(qword_count, 4);  // sub(8) and add(4)
 295     __ jcc(Assembler::greater, L_end);
 296     // Copy trailing 32 bytes
 297     if (UseAVX >= 2) {
 298       bs->copy_load_at(_masm, decorators, type, 32,
 299                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 300                        tmp1, xmm1);
 301       bs->copy_store_at(_masm, decorators, type, 32,
 302                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 303                         tmp1, tmp2, xmm1);
 304     } else {
 305       bs->copy_load_at(_masm, decorators, type, 16,
 306                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 307                        tmp1, xmm1);
 308       bs->copy_store_at(_masm, decorators, type, 16,
 309                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 310                         tmp1, tmp2, xmm1);
 311       bs->copy_load_at(_masm, decorators, type, 16,
 312                        xmm0, Address(end_from, qword_count, Address::times_8, -8),
 313                        tmp1, xmm1);
 314       bs->copy_store_at(_masm, decorators, type, 16,
 315                         Address(end_to, qword_count, Address::times_8, -8), xmm0,
 316                         tmp1, tmp2, xmm1);
 317     }
 318     __ addptr(qword_count, 4);
 319     __ BIND(L_end);
 320   } else {
 321     // Copy 32-bytes per iteration
 322     __ BIND(L_loop);
 323     bs->copy_load_at(_masm, decorators, type, 8,
 324                      tmp1, Address(end_from, qword_count, Address::times_8, -24),
 325                      tmp2);
 326     bs->copy_store_at(_masm, decorators, type, 8,
 327                       Address(end_to, qword_count, Address::times_8, -24), tmp1,
 328                       tmp2);
 329     bs->copy_load_at(_masm, decorators, type, 8,
 330                      tmp1, Address(end_from, qword_count, Address::times_8, -16),
 331                      tmp2);
 332     bs->copy_store_at(_masm, decorators, type, 8,
 333                       Address(end_to, qword_count, Address::times_8, -16), tmp1,
 334                       tmp2);
 335     bs->copy_load_at(_masm, decorators, type, 8,
 336                      tmp1, Address(end_from, qword_count, Address::times_8, -8),
 337                      tmp2);
 338     bs->copy_store_at(_masm, decorators, type, 8,
 339                       Address(end_to, qword_count, Address::times_8, -8), tmp1,
 340                       tmp2);
 341     bs->copy_load_at(_masm, decorators, type, 8,
 342                      tmp1, Address(end_from, qword_count, Address::times_8, 0),
 343                      tmp2);
 344     bs->copy_store_at(_masm, decorators, type, 8,
 345                       Address(end_to, qword_count, Address::times_8, 0), tmp1,
 346                       tmp2);
 347 
 348     __ BIND(L_copy_bytes);
 349     __ addptr(qword_count, 4);
 350     __ jcc(Assembler::lessEqual, L_loop);
 351   }
 352   __ subptr(qword_count, 4);
 353   __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
 354 }
 355 
 356 
 357 // Copy big chunks backward
 358 //
// Inputs:
//   from         - source array address
//   dest         - destination array address
//   qword_count  - 64-bit element count
//   tmp1, tmp2   - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes - exit label
 366 //
 367 void StubGenerator::copy_bytes_backward(Register from, Register dest,
 368                                         Register qword_count, Register tmp1,
 369                                         Register tmp2, Label& L_copy_bytes,
 370                                         Label& L_copy_8_bytes, DecoratorSet decorators,
 371                                         BasicType type) {
 372   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 373   DEBUG_ONLY(__ stop("enter at entry label, not here"));
 374   Label L_loop;
 375   __ align(OptoLoopAlignment);
 376   if (UseUnalignedLoadStores) {
 377     Label L_end;
 378     __ BIND(L_loop);
 379     if (UseAVX >= 2) {
 380       bs->copy_load_at(_masm, decorators, type, 32,
 381                        xmm0, Address(from, qword_count, Address::times_8, 32),
 382                        tmp1, xmm1);
 383       bs->copy_store_at(_masm, decorators, type, 32,
 384                         Address(dest, qword_count, Address::times_8, 32), xmm0,
 385                         tmp1, tmp2, xmm1);
 386       bs->copy_load_at(_masm, decorators, type, 32,
 387                        xmm0, Address(from, qword_count, Address::times_8, 0),
 388                        tmp1, xmm1);
 389       bs->copy_store_at(_masm, decorators, type, 32,
 390                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 391                         tmp1, tmp2, xmm1);
 392     } else {
 393       bs->copy_load_at(_masm, decorators, type, 16,
 394                        xmm0, Address(from, qword_count, Address::times_8, 48),
 395                        tmp1, xmm1);
 396       bs->copy_store_at(_masm, decorators, type, 16,
 397                         Address(dest, qword_count, Address::times_8, 48), xmm0,
 398                         tmp1, tmp2, xmm1);
 399       bs->copy_load_at(_masm, decorators, type, 16,
 400                        xmm0, Address(from, qword_count, Address::times_8, 32),
 401                        tmp1, xmm1);
 402       bs->copy_store_at(_masm, decorators, type, 16,
 403                         Address(dest, qword_count, Address::times_8, 32), xmm0,
 404                         tmp1, tmp2, xmm1);
 405       bs->copy_load_at(_masm, decorators, type, 16,
 406                        xmm0, Address(from, qword_count, Address::times_8, 16),
 407                        tmp1, xmm1);
 408       bs->copy_store_at(_masm, decorators, type, 16,
 409                         Address(dest, qword_count, Address::times_8, 16), xmm0,
 410                         tmp1, tmp2, xmm1);
 411       bs->copy_load_at(_masm, decorators, type, 16,
 412                        xmm0, Address(from, qword_count, Address::times_8, 0),
 413                        tmp1, xmm1);
 414       bs->copy_store_at(_masm, decorators, type, 16,
 415                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 416                         tmp1, tmp2, xmm1);
 417     }
 418 
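    // qword_count counts down toward zero; each iteration above copies 8 qwords
    // (64 bytes) walking backward from the end of the arrays. The adjustment
    // below then checks for a trailing block of at least 4 qwords (32 bytes).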
 419     __ BIND(L_copy_bytes);
 420     __ subptr(qword_count, 8);
 421     __ jcc(Assembler::greaterEqual, L_loop);
 422 
 423     __ addptr(qword_count, 4);  // add(8) and sub(4)
 424     __ jcc(Assembler::less, L_end);
 425     // Copy trailing 32 bytes
 426     if (UseAVX >= 2) {
 427       bs->copy_load_at(_masm, decorators, type, 32,
 428                        xmm0, Address(from, qword_count, Address::times_8, 0),
 429                        tmp1, xmm1);
 430       bs->copy_store_at(_masm, decorators, type, 32,
 431                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 432                         tmp1, tmp2, xmm1);
 433     } else {
 434       bs->copy_load_at(_masm, decorators, type, 16,
 435                        xmm0, Address(from, qword_count, Address::times_8, 16),
 436                        tmp1, xmm1);
 437       bs->copy_store_at(_masm, decorators, type, 16,
 438                         Address(dest, qword_count, Address::times_8, 16), xmm0,
 439                         tmp1, tmp2, xmm1);
 440       bs->copy_load_at(_masm, decorators, type, 16,
 441                        xmm0, Address(from, qword_count, Address::times_8, 0),
 442                        tmp1, xmm1);
 443       bs->copy_store_at(_masm, decorators, type, 16,
 444                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 445                         tmp1, tmp2, xmm1);
 446     }
 447     __ subptr(qword_count, 4);
 448     __ BIND(L_end);
 449   } else {
 450     // Copy 32-bytes per iteration
 451     __ BIND(L_loop);
 452     bs->copy_load_at(_masm, decorators, type, 8,
 453                      tmp1, Address(from, qword_count, Address::times_8, 24),
 454                      tmp2);
 455     bs->copy_store_at(_masm, decorators, type, 8,
 456                       Address(dest, qword_count, Address::times_8, 24), tmp1,
 457                       tmp2);
 458     bs->copy_load_at(_masm, decorators, type, 8,
 459                      tmp1, Address(from, qword_count, Address::times_8, 16),
 460                      tmp2);
 461     bs->copy_store_at(_masm, decorators, type, 8,
 462                       Address(dest, qword_count, Address::times_8, 16), tmp1,
 463                       tmp2);
 464     bs->copy_load_at(_masm, decorators, type, 8,
 465                      tmp1, Address(from, qword_count, Address::times_8, 8),
 466                      tmp2);
 467     bs->copy_store_at(_masm, decorators, type, 8,
 468                       Address(dest, qword_count, Address::times_8, 8), tmp1,
 469                       tmp2);
 470     bs->copy_load_at(_masm, decorators, type, 8,
 471                      tmp1, Address(from, qword_count, Address::times_8, 0),
 472                      tmp2);
 473     bs->copy_store_at(_masm, decorators, type, 8,
 474                       Address(dest, qword_count, Address::times_8, 0), tmp1,
 475                       tmp2);
 476 
 477     __ BIND(L_copy_bytes);
 478     __ subptr(qword_count, 4);
 479     __ jcc(Assembler::greaterEqual, L_loop);
 480   }
 481   __ addptr(qword_count, 4);
 482   __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
 483 }
 484 
 485 #if COMPILER2_OR_JVMCI
 486 
// Note: The following rules apply to the AVX3-optimized arraycopy stubs:
// - If the target supports the AVX3 features (BW+VL+F), the implementation uses
//   32 byte vectors (YMMs) for both the special cases (various small block sizes)
//   and the aligned copy loop. This is the default configuration.
// - If the copy length is above AVX3Threshold, the implementation uses 64 byte
//   vectors (ZMMs) for the main copy loop (and its tail) since the bulk of the
//   cycles will be consumed there.
// - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS shows
//   better performance for disjoint copies. For conjoint/backward copies a
//   vector based copy performs better.
// - If the user sets AVX3Threshold=0, the special cases for small block sizes
//   operate over 64 byte vector registers (ZMMs).
 498 
 499 // Inputs:
 500 //   c_rarg0   - source array address
 501 //   c_rarg1   - destination array address
 502 //   c_rarg2   - element count, treated as ssize_t, can be zero
 503 //
 504 //
 505 // Side Effects:
 506 //   disjoint_copy_avx3_masked is set to the no-overlap entry point
 507 //   used by generate_conjoint_[byte/int/short/long]_copy().
 508 //
 509 address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const char *name,
 510                                                           int shift, bool aligned, bool is_oop,
 511                                                           bool dest_uninitialized) {
 512   __ align(CodeEntryAlignment);
 513   StubCodeMark mark(this, "StubRoutines", name);
 514   address start = __ pc();
 515 
 516   int avx3threshold = VM_Version::avx3_threshold();
 517   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
 518   const int large_threshold = 2621440; // 2.5 MB
 519   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 520   Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 521   Label L_copy_large, L_finish;
 522   const Register from        = rdi;  // source array address
 523   const Register to          = rsi;  // destination array address
 524   const Register count       = rdx;  // elements count
 525   const Register temp1       = r8;
 526   const Register temp2       = r11;
 527   const Register temp3       = rax;
 528   const Register temp4       = rcx;
 529   // End pointers are inclusive, and if count is not zero they point
 530   // to the last unit copied:  end_to[0] := end_from[0]
 531 
 532   __ enter(); // required for proper stackwalking of RuntimeStub frame
 533   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 534 
 535   if (entry != nullptr) {
 536     *entry = __ pc();
 537      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 538     BLOCK_COMMENT("Entry:");
 539   }
 540 
 541   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 542   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 543 
 544   setup_argument_regs(type);
 545 
 546   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 547   if (dest_uninitialized) {
 548     decorators |= IS_DEST_UNINITIALIZED;
 549   }
 550   if (aligned) {
 551     decorators |= ARRAYCOPY_ALIGNED;
 552   }
 553   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 554   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 555 
 556   {
 557     // Type(shift)           byte(0), short(1), int(2),   long(3)
 558     int loop_size[]        = { 192,     96,       48,      24};
 559     int threshold[]        = { 4096,    2048,     1024,    512};
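    // loop_size[shift] is the element count of one 192 byte main-loop block;
    // threshold[shift] is the element count of 4096 bytes, above which (when an
    // AVX3 threshold is in effect) the 64 byte vector or REP MOVS path is taken.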
 560 
 561     // UnsafeCopyMemory page error: continue after ucm
 562     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
 563     // 'from', 'to' and 'count' are now valid
 564 
 565     // temp1 holds remaining count and temp4 holds running count used to compute
 566     // next address offset for start of to/from addresses (temp4 * scale).
 567     __ mov64(temp4, 0);
 568     __ movq(temp1, count);
 569 
 570     // Zero length check.
 571     __ BIND(L_tail);
 572     __ cmpq(temp1, 0);
 573     __ jcc(Assembler::lessEqual, L_exit);
 574 
 575     // Special cases using 32 byte [masked] vector copy operations.
 576     arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 577                                  temp4, temp3, use64byteVector, L_entry, L_exit);
 578 
 579     // PRE-MAIN-POST loop for aligned copy.
 580     __ BIND(L_entry);
 581 
 582     if (MaxVectorSize == 64) {
 583       __ movq(temp2, temp1);
 584       __ shlq(temp2, shift);
 585       __ cmpq(temp2, large_threshold);
 586       __ jcc(Assembler::greaterEqual, L_copy_large);
 587     }
 588     if (avx3threshold != 0) {
 589       __ cmpq(count, threshold[shift]);
 590       if (MaxVectorSize == 64) {
 591         // Copy using 64 byte vectors.
 592         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 593       } else {
 594         assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offers a faster copy path.
 596         __ jcc(Assembler::greaterEqual, L_repmovs);
 597       }
 598     }
 599 
 600     if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
 601       // Partial copy to make dst address 32 byte aligned.
 602       __ movq(temp2, to);
 603       __ andq(temp2, 31);
 604       __ jcc(Assembler::equal, L_main_pre_loop);
 605 
 606       __ negptr(temp2);
 607       __ addq(temp2, 32);
 608       if (shift) {
 609         __ shrq(temp2, shift);
 610       }
 611       __ movq(temp3, temp2);
 612       copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
 613       __ movq(temp4, temp2);
 614       __ movq(temp1, count);
 615       __ subq(temp1, temp2);
 616 
 617       __ cmpq(temp1, loop_size[shift]);
 618       __ jcc(Assembler::less, L_tail);
 619 
 620       __ BIND(L_main_pre_loop);
 621       __ subq(temp1, loop_size[shift]);
 622 
 623       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 624       __ align32();
 625       __ BIND(L_main_loop);
 626          copy64_avx(to, from, temp4, xmm1, false, shift, 0);
 627          copy64_avx(to, from, temp4, xmm1, false, shift, 64);
 628          copy64_avx(to, from, temp4, xmm1, false, shift, 128);
 629          __ addptr(temp4, loop_size[shift]);
 630          __ subq(temp1, loop_size[shift]);
 631          __ jcc(Assembler::greater, L_main_loop);
 632 
 633       __ addq(temp1, loop_size[shift]);
 634 
 635       // Tail loop.
 636       __ jmp(L_tail);
 637 
 638       __ BIND(L_repmovs);
 639         __ movq(temp2, temp1);
        // Swap to(RSI) and from(RDI) addresses to comply with REP MOVS semantics.
 641         __ movq(temp3, to);
 642         __ movq(to,  from);
 643         __ movq(from, temp3);
 644         // Save to/from for restoration post rep_mov.
 645         __ movq(temp1, to);
 646         __ movq(temp3, from);
 647         if(shift < 3) {
 648           __ shrq(temp2, 3-shift);     // quad word count
 649         }
        __ movq(temp4, temp2);         // move quad word count into temp4(RCX).
 651         __ rep_mov();
 652         __ shlq(temp2, 3);             // convert quad words into byte count.
 653         if(shift) {
 654           __ shrq(temp2, shift);       // type specific count.
 655         }
 656         // Restore original addresses in to/from.
 657         __ movq(to, temp3);
 658         __ movq(from, temp1);
 659         __ movq(temp4, temp2);
 660         __ movq(temp1, count);
        __ subq(temp1, temp2);         // trailing part (less than a quad word size).
 662         __ jmp(L_tail);
 663     }
 664 
 665     if (MaxVectorSize > 32) {
 666       __ BIND(L_pre_main_post_64);
 667       // Partial copy to make dst address 64 byte aligned.
 668       __ movq(temp2, to);
 669       __ andq(temp2, 63);
 670       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 671 
 672       __ negptr(temp2);
 673       __ addq(temp2, 64);
 674       if (shift) {
 675         __ shrq(temp2, shift);
 676       }
 677       __ movq(temp3, temp2);
 678       copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
 679       __ movq(temp4, temp2);
 680       __ movq(temp1, count);
 681       __ subq(temp1, temp2);
 682 
 683       __ cmpq(temp1, loop_size[shift]);
 684       __ jcc(Assembler::less, L_tail64);
 685 
 686       __ BIND(L_main_pre_loop_64bytes);
 687       __ subq(temp1, loop_size[shift]);
 688 
 689       // Main loop with aligned copy block size of 192 bytes at
 690       // 64 byte copy granularity.
 691       __ align32();
 692       __ BIND(L_main_loop_64bytes);
 693          copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
 694          copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
 695          copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
 696          __ addptr(temp4, loop_size[shift]);
 697          __ subq(temp1, loop_size[shift]);
 698          __ jcc(Assembler::greater, L_main_loop_64bytes);
 699 
 700       __ addq(temp1, loop_size[shift]);
 701       // Zero length check.
 702       __ jcc(Assembler::lessEqual, L_exit);
 703 
 704       __ BIND(L_tail64);
 705 
 706       // Tail handling using 64 byte [masked] vector copy operations.
 707       use64byteVector = true;
 708       arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 709                                    temp4, temp3, use64byteVector, L_entry, L_exit);
 710     }
 711     __ BIND(L_exit);
 712   }
 713 
 714   __ BIND(L_finish);
 715   address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 contains specific values
  // used during the arraycopy epilogue, so re-initialize r11 here.
 718   if (is_oop) {
 719     __ movq(r11, shift == 3 ? count : to);
 720   }
 721   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
 722   restore_argument_regs(type);
 723   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
 724   __ xorptr(rax, rax); // return 0
 725   __ vzeroupper();
 726   __ leave(); // required for proper stackwalking of RuntimeStub frame
 727   __ ret(0);
 728 
 729   if (MaxVectorSize == 64) {
 730     __ BIND(L_copy_large);
 731     arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
 732     __ jmp(L_finish);
 733   }
 734   return start;
 735 }
 736 
 737 void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
 738                                          Register temp3, Register temp4, Register count,
 739                                          XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
 740                                          XMMRegister xmm4, int shift) {
 741 
 742   // Type(shift)           byte(0), short(1), int(2),   long(3)
 743   int loop_size[]        = { 256,     128,       64,      32};
 744   int threshold[]        = { 4096,    2048,     1024,    512};
 745 
 746   Label L_main_loop_large;
 747   Label L_tail_large;
 748   Label L_exit_large;
 749   Label L_entry_large;
 750   Label L_main_pre_loop_large;
 751   Label L_pre_main_post_large;
 752 
 753   assert(MaxVectorSize == 64, "vector length != 64");
 754   __ BIND(L_entry_large);
 755 
 756   __ BIND(L_pre_main_post_large);
 757   // Partial copy to make dst address 64 byte aligned.
 758   __ movq(temp2, to);
 759   __ andq(temp2, 63);
 760   __ jcc(Assembler::equal, L_main_pre_loop_large);
 761 
 762   __ negptr(temp2);
 763   __ addq(temp2, 64);
 764   if (shift) {
 765     __ shrq(temp2, shift);
 766   }
 767   __ movq(temp3, temp2);
 768   copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
 769   __ movq(temp4, temp2);
 770   __ movq(temp1, count);
 771   __ subq(temp1, temp2);
 772 
 773   __ cmpq(temp1, loop_size[shift]);
 774   __ jcc(Assembler::less, L_tail_large);
 775 
 776   __ BIND(L_main_pre_loop_large);
 777   __ subq(temp1, loop_size[shift]);
 778 
 779   // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
 780   __ align32();
 781   __ BIND(L_main_loop_large);
 782   copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
 783   __ addptr(temp4, loop_size[shift]);
 784   __ subq(temp1, loop_size[shift]);
 785   __ jcc(Assembler::greater, L_main_loop_large);
 786   // fence needed because copy256_avx3 uses non-temporal stores
 787   __ sfence();
 788 
 789   __ addq(temp1, loop_size[shift]);
 790   // Zero length check.
 791   __ jcc(Assembler::lessEqual, L_exit_large);
 792   __ BIND(L_tail_large);
 793   // Tail handling using 64 byte [masked] vector copy operations.
 794   __ cmpq(temp1, 0);
 795   __ jcc(Assembler::lessEqual, L_exit_large);
 796   arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
 797                                temp4, temp3, L_exit_large);
 798   __ BIND(L_exit_large);
 799 }
 800 
 801 // Inputs:
 802 //   c_rarg0   - source array address
 803 //   c_rarg1   - destination array address
 804 //   c_rarg2   - element count, treated as ssize_t, can be zero
 805 //
 806 //
 807 address StubGenerator::generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
 808                                                           address nooverlap_target, bool aligned,
 809                                                           bool is_oop, bool dest_uninitialized) {
 810   __ align(CodeEntryAlignment);
 811   StubCodeMark mark(this, "StubRoutines", name);
 812   address start = __ pc();
 813 
 814   int avx3threshold = VM_Version::avx3_threshold();
 815   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
 816 
 817   Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 818   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 819   const Register from        = rdi;  // source array address
 820   const Register to          = rsi;  // destination array address
 821   const Register count       = rdx;  // elements count
 822   const Register temp1       = r8;
 823   const Register temp2       = rcx;
 824   const Register temp3       = r11;
 825   const Register temp4       = rax;
 826   // End pointers are inclusive, and if count is not zero they point
 827   // to the last unit copied:  end_to[0] := end_from[0]
 828 
 829   __ enter(); // required for proper stackwalking of RuntimeStub frame
 830   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 831 
 832   if (entry != nullptr) {
 833     *entry = __ pc();
 834      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 835     BLOCK_COMMENT("Entry:");
 836   }
 837 
 838   array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
 839 
 840   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 841   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 842 
 843   setup_argument_regs(type);
 844 
 845   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 846   if (dest_uninitialized) {
 847     decorators |= IS_DEST_UNINITIALIZED;
 848   }
 849   if (aligned) {
 850     decorators |= ARRAYCOPY_ALIGNED;
 851   }
 852   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 853   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 854   {
 855     // Type(shift)       byte(0), short(1), int(2),   long(3)
 856     int loop_size[]   = { 192,     96,       48,      24};
 857     int threshold[]   = { 4096,    2048,     1024,    512};
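    // Element counts of one 192 byte main-loop block and of the 4096 byte
    // threshold above which the 64 byte vector path is taken.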
 858 
 859     // UnsafeCopyMemory page error: continue after ucm
 860     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
 861     // 'from', 'to' and 'count' are now valid
 862 
 863     // temp1 holds remaining count.
 864     __ movq(temp1, count);
 865 
 866     // Zero length check.
 867     __ BIND(L_tail);
 868     __ cmpq(temp1, 0);
 869     __ jcc(Assembler::lessEqual, L_exit);
 870 
 871     __ mov64(temp2, 0);
 872     __ movq(temp3, temp1);
 873     // Special cases using 32 byte [masked] vector copy operations.
 874     arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
 875                                           temp4, use64byteVector, L_entry, L_exit);
 876 
 877     // PRE-MAIN-POST loop for aligned copy.
 878     __ BIND(L_entry);
 879 
 880     if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
 881       __ cmpq(temp1, threshold[shift]);
 882       __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 883     }
 884 
 885     if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
 886       // Partial copy to make dst address 32 byte aligned.
 887       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
 888       __ andq(temp2, 31);
 889       __ jcc(Assembler::equal, L_main_pre_loop);
 890 
 891       if (shift) {
 892         __ shrq(temp2, shift);
 893       }
 894       __ subq(temp1, temp2);
 895       copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
 896 
 897       __ cmpq(temp1, loop_size[shift]);
 898       __ jcc(Assembler::less, L_tail);
 899 
 900       __ BIND(L_main_pre_loop);
 901 
 902       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 903       __ align32();
 904       __ BIND(L_main_loop);
 905          copy64_avx(to, from, temp1, xmm1, true, shift, -64);
 906          copy64_avx(to, from, temp1, xmm1, true, shift, -128);
 907          copy64_avx(to, from, temp1, xmm1, true, shift, -192);
 908          __ subptr(temp1, loop_size[shift]);
 909          __ cmpq(temp1, loop_size[shift]);
 910          __ jcc(Assembler::greater, L_main_loop);
 911 
 912       // Tail loop.
 913       __ jmp(L_tail);
 914     }
 915 
 916     if (MaxVectorSize > 32) {
 917       __ BIND(L_pre_main_post_64);
 918       // Partial copy to make dst address 64 byte aligned.
 919       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
 920       __ andq(temp2, 63);
 921       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 922 
 923       if (shift) {
 924         __ shrq(temp2, shift);
 925       }
 926       __ subq(temp1, temp2);
 927       copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
 928 
 929       __ cmpq(temp1, loop_size[shift]);
 930       __ jcc(Assembler::less, L_tail64);
 931 
 932       __ BIND(L_main_pre_loop_64bytes);
 933 
 934       // Main loop with aligned copy block size of 192 bytes at
 935       // 64 byte copy granularity.
 936       __ align32();
 937       __ BIND(L_main_loop_64bytes);
 938          copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
 939          copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
 940          copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
 941          __ subq(temp1, loop_size[shift]);
 942          __ cmpq(temp1, loop_size[shift]);
 943          __ jcc(Assembler::greater, L_main_loop_64bytes);
 944 
 945       // Zero length check.
 946       __ cmpq(temp1, 0);
 947       __ jcc(Assembler::lessEqual, L_exit);
 948 
 949       __ BIND(L_tail64);
 950 
 951       // Tail handling using 64 byte [masked] vector copy operations.
 952       use64byteVector = true;
 953       __ mov64(temp2, 0);
 954       __ movq(temp3, temp1);
 955       arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
 956                                             temp4, use64byteVector, L_entry, L_exit);
 957     }
 958     __ BIND(L_exit);
 959   }
 960   address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 contains specific values
  // used during the arraycopy epilogue, so re-initialize r11 here.
  if (is_oop) {
 964     __ movq(r11, count);
 965   }
 966   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
 967   restore_argument_regs(type);
 968   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
 969   __ xorptr(rax, rax); // return 0
 970   __ vzeroupper();
 971   __ leave(); // required for proper stackwalking of RuntimeStub frame
 972   __ ret(0);
 973 
 974   return start;
 975 }
 976 
 977 void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
 978                                                  Register to, Register count, int shift,
 979                                                  Register index, Register temp,
 980                                                  bool use64byteVector, Label& L_entry, Label& L_exit) {
 981   Label L_entry_64, L_entry_96, L_entry_128;
 982   Label L_entry_160, L_entry_192;
 983 
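  // size_mat[shift][i] is the element count corresponding to a copy of
  // 32 * (i + 1) bytes for an element size of (1 << shift) bytes.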
 984   int size_mat[][6] = {
 985   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
 986   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
 987   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
 988   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
 989   };
 990 
  // Case A) Special case for length less than or equal to 32 bytes.
 992   __ cmpq(count, size_mat[shift][0]);
 993   __ jccb(Assembler::greater, L_entry_64);
 994   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
 995   __ jmp(L_exit);
 996 
  // Case B) Special case for length less than or equal to 64 bytes.
 998   __ BIND(L_entry_64);
 999   __ cmpq(count, size_mat[shift][1]);
1000   __ jccb(Assembler::greater, L_entry_96);
1001   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
1002   __ jmp(L_exit);
1003 
  // Case C) Special case for length less than or equal to 96 bytes.
1005   __ BIND(L_entry_96);
1006   __ cmpq(count, size_mat[shift][2]);
1007   __ jccb(Assembler::greater, L_entry_128);
1008   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1009   __ subq(count, 64 >> shift);
1010   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
1011   __ jmp(L_exit);
1012 
  // Case D) Special case for length less than or equal to 128 bytes.
1014   __ BIND(L_entry_128);
1015   __ cmpq(count, size_mat[shift][3]);
1016   __ jccb(Assembler::greater, L_entry_160);
1017   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1018   copy32_avx(to, from, index, xmm, shift, 64);
1019   __ subq(count, 96 >> shift);
1020   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
1021   __ jmp(L_exit);
1022 
  // Case E) Special case for length less than or equal to 160 bytes.
1024   __ BIND(L_entry_160);
1025   __ cmpq(count, size_mat[shift][4]);
1026   __ jccb(Assembler::greater, L_entry_192);
1027   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1028   copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1029   __ subq(count, 128 >> shift);
1030   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
1031   __ jmp(L_exit);
1032 
  // Case F) Special case for length less than or equal to 192 bytes.
1034   __ BIND(L_entry_192);
1035   __ cmpq(count, size_mat[shift][5]);
1036   __ jcc(Assembler::greater, L_entry);
1037   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1038   copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1039   copy32_avx(to, from, index, xmm, shift, 128);
1040   __ subq(count, 160 >> shift);
1041   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
1042   __ jmp(L_exit);
1043 }
1044 
1045 void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
1046                                                      Register to, Register count, int shift, Register index,
1047                                                      Register temp, Label& L_exit) {
1048   Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
1049 
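  // size_mat[shift][i] is the element count corresponding to a copy of
  // 64 * (i + 1) bytes for an element size of (1 << shift) bytes.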
1050   int size_mat[][4] = {
1051   /* T_BYTE */ {64, 128, 192, 256},
1052   /* T_SHORT*/ {32, 64 , 96 , 128},
1053   /* T_INT  */ {16, 32 , 48 ,  64},
1054   /* T_LONG */ { 8, 16 , 24 ,  32}
1055   };
1056 
1057   assert(MaxVectorSize == 64, "vector length != 64");
1058   // Case A) Special case for length less than or equal to 64 bytes.
1059   __ BIND(L_entry_64);
1060   __ cmpq(count, size_mat[shift][0]);
1061   __ jccb(Assembler::greater, L_entry_128);
1062   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
1063   __ jmp(L_exit);
1064 
1065   // Case B) Special case for length less than or equal to 128 bytes.
1066   __ BIND(L_entry_128);
1067   __ cmpq(count, size_mat[shift][1]);
1068   __ jccb(Assembler::greater, L_entry_192);
1069   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1070   __ subq(count, 64 >> shift);
1071   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
1072   __ jmp(L_exit);
1073 
1074   // Case C) Special case for length less than or equal to 192 bytes.
1075   __ BIND(L_entry_192);
1076   __ cmpq(count, size_mat[shift][2]);
1077   __ jcc(Assembler::greater, L_entry_256);
1078   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1079   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1080   __ subq(count, 128 >> shift);
1081   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
1082   __ jmp(L_exit);
1083 
1084   // Case D) Special case for length less than or equal to 256 bytes.
1085   __ BIND(L_entry_256);
1086   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1087   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1088   copy64_avx(to, from, index, xmm, false, shift, 128, true);
1089   __ subq(count, 192 >> shift);
1090   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
1091   __ jmp(L_exit);
1092 }
1093 
1094 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
1095                                                            Register to, Register start_index, Register end_index,
1096                                                            Register count, int shift, Register temp,
1097                                                            bool use64byteVector, Label& L_entry, Label& L_exit) {
1098   Label L_entry_64, L_entry_96, L_entry_128;
1099   Label L_entry_160, L_entry_192;
1100   bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
1101 
1102   int size_mat[][6] = {
1103   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
1104   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
1105   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
1106   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
1107   };
1108 
  // Case A) Special case for length less than or equal to 32 bytes.
1110   __ cmpq(count, size_mat[shift][0]);
1111   __ jccb(Assembler::greater, L_entry_64);
1112   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1113   __ jmp(L_exit);
1114 
  // Case B) Special case for length less than or equal to 64 bytes.
1116   __ BIND(L_entry_64);
1117   __ cmpq(count, size_mat[shift][1]);
1118   __ jccb(Assembler::greater, L_entry_96);
1119   if (avx3) {
1120      copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
1121   } else {
1122      copy32_avx(to, from, end_index, xmm, shift, -32);
1123      __ subq(count, 32 >> shift);
1124      copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1125   }
1126   __ jmp(L_exit);
1127 
  // Case C) Special case for length less than or equal to 96 bytes.
1129   __ BIND(L_entry_96);
1130   __ cmpq(count, size_mat[shift][2]);
1131   __ jccb(Assembler::greater, L_entry_128);
1132   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1133   __ subq(count, 64 >> shift);
1134   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1135   __ jmp(L_exit);
1136 
  // Case D) Special case for length less than or equal to 128 bytes.
1138   __ BIND(L_entry_128);
1139   __ cmpq(count, size_mat[shift][3]);
1140   __ jccb(Assembler::greater, L_entry_160);
1141   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1142   copy32_avx(to, from, end_index, xmm, shift, -96);
1143   __ subq(count, 96 >> shift);
1144   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1145   __ jmp(L_exit);
1146 
  // Case E) Special case for length less than or equal to 160 bytes.
1148   __ BIND(L_entry_160);
1149   __ cmpq(count, size_mat[shift][4]);
1150   __ jccb(Assembler::greater, L_entry_192);
1151   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1152   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1153   __ subq(count, 128 >> shift);
1154   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1155   __ jmp(L_exit);
1156 
  // Case F) Special case for length less than or equal to 192 bytes.
1158   __ BIND(L_entry_192);
1159   __ cmpq(count, size_mat[shift][5]);
1160   __ jcc(Assembler::greater, L_entry);
1161   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1162   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1163   copy32_avx(to, from, end_index, xmm, shift, -160);
1164   __ subq(count, 160 >> shift);
1165   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1166   __ jmp(L_exit);
1167 }
1168 
1169 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1170                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1171                                 int shift, int offset) {
1172   if (MaxVectorSize == 64) {
1173     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
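    // Prefetch source data roughly 512 and 1024 bytes ahead of the current
    // copy position to hide memory latency during the streaming copy below.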
1174     __ prefetcht0(Address(src, index, scale, offset + 0x200));
1175     __ prefetcht0(Address(src, index, scale, offset + 0x240));
1176     __ prefetcht0(Address(src, index, scale, offset + 0x280));
1177     __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1178 
1179     __ prefetcht0(Address(src, index, scale, offset + 0x400));
1180     __ prefetcht0(Address(src, index, scale, offset + 0x440));
1181     __ prefetcht0(Address(src, index, scale, offset + 0x480));
1182     __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1183 
1184     __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1185     __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1186     __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1187     __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1188 
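    // Store with non-temporal hints to avoid polluting the cache on this large
    // copy; the caller issues an sfence once the copy loop is done.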
1189     __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1190     __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1191     __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1192     __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1193   }
1194 }
1195 
1196 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1197                                        KRegister mask, Register length, Register index,
1198                                        Register temp, int shift, int offset,
1199                                        bool use64byteVector) {
1200   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1201   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1202   if (!use64byteVector) {
1203     copy32_avx(dst, src, index, xmm, shift, offset);
1204     __ subptr(length, 32 >> shift);
1205     copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1206   } else {
1207     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1208     assert(MaxVectorSize == 64, "vector length != 64");
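         // Build a k-mask with only the low 'length' bits set: BZHI of -1 clears
         // every bit at position >= length (e.g. length == 5 yields 0b11111), so
         // the masked load/store below touches exactly 'length' tail elements.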
1209     __ mov64(temp, -1L);
1210     __ bzhiq(temp, temp, length);
1211     __ kmovql(mask, temp);
1212     __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1213     __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1214   }
1215 }
1216 
1217 
1218 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1219                                        KRegister mask, Register length, Register index,
1220                                        Register temp, int shift, int offset) {
1221   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1222   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1223   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
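       // Same tail-masking scheme as copy64_masked_avx, but with 256-bit moves,
       // so at most 32 bytes (the low 'length' mask bits) are transferred.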
1224   __ mov64(temp, -1L);
1225   __ bzhiq(temp, temp, length);
1226   __ kmovql(mask, temp);
1227   __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1228   __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1229 }
1230 
1231 
1232 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1233                                 int shift, int offset) {
1234   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1235   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1236   __ vmovdqu(xmm, Address(src, index, scale, offset));
1237   __ vmovdqu(Address(dst, index, scale, offset), xmm);
1238 }
1239 
1240 
1241 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1242                                 bool conjoint, int shift, int offset, bool use64byteVector) {
1243   assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1244   if (!use64byteVector) {
1245     if (conjoint) {
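           // Conjoint copies run from high to low addresses: move the upper
           // 32-byte half before the lower one. Each half is loaded before it
           // is stored, so an overlapping destination above the source cannot
           // clobber bytes the second (lower) load still has to read.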
1246       copy32_avx(dst, src, index, xmm, shift, offset+32);
1247       copy32_avx(dst, src, index, xmm, shift, offset);
1248     } else {
1249       copy32_avx(dst, src, index, xmm, shift, offset);
1250       copy32_avx(dst, src, index, xmm, shift, offset+32);
1251     }
1252   } else {
1253     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1254     __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1255     __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1256   }
1257 }
1258 
1259 #endif // COMPILER2_OR_JVMCI
1260 
1261 
1262 // Arguments:
1263 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1264 //             ignored
1265 //   name    - stub name string
1266 //
1267 // Inputs:
1268 //   c_rarg0   - source array address
1269 //   c_rarg1   - destination array address
1270 //   c_rarg2   - element count, treated as ssize_t, can be zero
1271 //
1272 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1273 // we let the hardware handle it.  The one to eight bytes within words,
1274 // dwords or qwords that span cache line boundaries will still be loaded
1275 // and stored atomically.
1276 //
1277 // Side Effects:
1278 //   disjoint_byte_copy_entry is set to the no-overlap entry point
1279 //   used by generate_conjoint_byte_copy().
1280 //
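     // In effect the stub below performs the following (illustrative sketch
     // only; the generated code works off a negative qword index and an
     // inclusive end pointer, and bulk-copies large runs in copy_bytes_forward):
     //
     //   size_t qwords = count >> 3;
     //   while (qwords--) { *(uint64_t*)to = *(uint64_t*)from; to += 8; from += 8; }
     //   if (count & 4)   { *(uint32_t*)to = *(uint32_t*)from; to += 4; from += 4; }
     //   if (count & 2)   { *(uint16_t*)to = *(uint16_t*)from; to += 2; from += 2; }
     //   if (count & 1)   {  *(uint8_t*)to =  *(uint8_t*)from; }
     //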
1281 address StubGenerator::generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1282 #if COMPILER2_OR_JVMCI
1283   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1284      return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1285                                                aligned, false, false);
1286   }
1287 #endif
1288   __ align(CodeEntryAlignment);
1289   StubCodeMark mark(this, "StubRoutines", name);
1290   address start = __ pc();
1291   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1292 
1293   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1294   Label L_copy_byte, L_exit;
1295   const Register from        = rdi;  // source array address
1296   const Register to          = rsi;  // destination array address
1297   const Register count       = rdx;  // elements count
1298   const Register byte_count  = rcx;
1299   const Register qword_count = count;
1300   const Register end_from    = from; // source array end address
1301   const Register end_to      = to;   // destination array end address
1302   // End pointers are inclusive, and if count is not zero they point
1303   // to the last unit copied:  end_to[0] := end_from[0]
1304 
1305   __ enter(); // required for proper stackwalking of RuntimeStub frame
1306   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1307 
1308   if (entry != nullptr) {
1309     *entry = __ pc();
1310      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1311     BLOCK_COMMENT("Entry:");
1312   }
1313 
1314   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1315                     // r9 and r10 may be used to save non-volatile registers
1316 
1317   {
1318     // UnsafeCopyMemory page error: continue after ucm
1319     UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1320     // 'from', 'to' and 'count' are now valid
1321     __ movptr(byte_count, count);
1322     __ shrptr(count, 3); // count => qword_count
1323 
1324     // Copy from low to high addresses.  Use 'to' as scratch.
1325     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1326     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1327     __ negptr(qword_count); // make the count negative
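         // Indexing off the inclusive end pointers with the negative qword_count
         // addresses the first qword of each array; the count is then incremented
         // toward zero as the copy advances, so the qword loop exits right after
         // the last full qword has been copied.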
1328     __ jmp(L_copy_bytes);
1329 
1330     // Copy trailing qwords
1331   __ BIND(L_copy_8_bytes);
1332     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1333     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1334     __ increment(qword_count);
1335     __ jcc(Assembler::notZero, L_copy_8_bytes);
1336 
1337     // Check for and copy trailing dword
1338   __ BIND(L_copy_4_bytes);
1339     __ testl(byte_count, 4);
1340     __ jccb(Assembler::zero, L_copy_2_bytes);
1341     __ movl(rax, Address(end_from, 8));
1342     __ movl(Address(end_to, 8), rax);
1343 
1344     __ addptr(end_from, 4);
1345     __ addptr(end_to, 4);
1346 
1347     // Check for and copy trailing word
1348   __ BIND(L_copy_2_bytes);
1349     __ testl(byte_count, 2);
1350     __ jccb(Assembler::zero, L_copy_byte);
1351     __ movw(rax, Address(end_from, 8));
1352     __ movw(Address(end_to, 8), rax);
1353 
1354     __ addptr(end_from, 2);
1355     __ addptr(end_to, 2);
1356 
1357     // Check for and copy trailing byte
1358   __ BIND(L_copy_byte);
1359     __ testl(byte_count, 1);
1360     __ jccb(Assembler::zero, L_exit);
1361     __ movb(rax, Address(end_from, 8));
1362     __ movb(Address(end_to, 8), rax);
1363   }
1364 __ BIND(L_exit);
1365   address ucme_exit_pc = __ pc();
1366   restore_arg_regs();
1367   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1368   __ xorptr(rax, rax); // return 0
1369   __ vzeroupper();
1370   __ leave(); // required for proper stackwalking of RuntimeStub frame
1371   __ ret(0);
1372 
1373   {
1374     UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1375     // Copy in multi-byte chunks
1376     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1377     __ jmp(L_copy_4_bytes);
1378   }
1379   return start;
1380 }
1381 
1382 
1383 // Arguments:
1384 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1385 //             ignored
1386 //   name    - stub name string
1387 //
1388 // Inputs:
1389 //   c_rarg0   - source array address
1390 //   c_rarg1   - destination array address
1391 //   c_rarg2   - element count, treated as ssize_t, can be zero
1392 //
1393 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1394 // we let the hardware handle it.  The one to eight bytes within words,
1395 // dwords or qwords that span cache line boundaries will still be loaded
1396 // and stored atomically.
1397 //
1398 address StubGenerator::generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1399                                                    address* entry, const char *name) {
1400 #if COMPILER2_OR_JVMCI
1401   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1402      return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
1403                                                nooverlap_target, aligned, false, false);
1404   }
1405 #endif
1406   __ align(CodeEntryAlignment);
1407   StubCodeMark mark(this, "StubRoutines", name);
1408   address start = __ pc();
1409   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1410 
1411   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1412   const Register from        = rdi;  // source array address
1413   const Register to          = rsi;  // destination array address
1414   const Register count       = rdx;  // elements count
1415   const Register byte_count  = rcx;
1416   const Register qword_count = count;
1417 
1418   __ enter(); // required for proper stackwalking of RuntimeStub frame
1419   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1420 
1421   if (entry != nullptr) {
1422     *entry = __ pc();
1423     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1424     BLOCK_COMMENT("Entry:");
1425   }
1426 
1427   array_overlap_test(nooverlap_target, Address::times_1);
1428   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1429                     // r9 and r10 may be used to save non-volatile registers
1430 
1431   {
1432     // UnsafeCopyMemory page error: continue after ucm
1433     UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1434     // 'from', 'to' and 'count' are now valid
1435     __ movptr(byte_count, count);
1436     __ shrptr(count, 3);   // count => qword_count
1437 
1438     // Copy from high to low addresses.
1439 
1440     // Check for and copy trailing byte
1441     __ testl(byte_count, 1);
1442     __ jcc(Assembler::zero, L_copy_2_bytes);
1443     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1444     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1445     __ decrement(byte_count); // Adjust for possible trailing word
1446 
1447     // Check for and copy trailing word
1448   __ BIND(L_copy_2_bytes);
1449     __ testl(byte_count, 2);
1450     __ jcc(Assembler::zero, L_copy_4_bytes);
1451     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1452     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1453 
1454     // Check for and copy trailing dword
1455   __ BIND(L_copy_4_bytes);
1456     __ testl(byte_count, 4);
1457     __ jcc(Assembler::zero, L_copy_bytes);
1458     __ movl(rax, Address(from, qword_count, Address::times_8));
1459     __ movl(Address(to, qword_count, Address::times_8), rax);
1460     __ jmp(L_copy_bytes);
1461 
1462     // Copy trailing qwords
1463   __ BIND(L_copy_8_bytes);
1464     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1465     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1466     __ decrement(qword_count);
1467     __ jcc(Assembler::notZero, L_copy_8_bytes);
1468   }
1469   restore_arg_regs();
1470   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1471   __ xorptr(rax, rax); // return 0
1472   __ vzeroupper();
1473   __ leave(); // required for proper stackwalking of RuntimeStub frame
1474   __ ret(0);
1475 
1476   {
1477     // UnsafeCopyMemory page error: continue after ucm
1478     UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1479     // Copy in multi-byte chunks
1480     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1481   }
1482   restore_arg_regs();
1483   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1484   __ xorptr(rax, rax); // return 0
1485   __ vzeroupper();
1486   __ leave(); // required for proper stackwalking of RuntimeStub frame
1487   __ ret(0);
1488 
1489   return start;
1490 }
1491 
1492 
1493 // Arguments:
1494 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1495 //             ignored
1496 //   name    - stub name string
1497 //
1498 // Inputs:
1499 //   c_rarg0   - source array address
1500 //   c_rarg1   - destination array address
1501 //   c_rarg2   - element count, treated as ssize_t, can be zero
1502 //
1503 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1504 // let the hardware handle it.  The two or four words within dwords
1505 // or qwords that span cache line boundaries will still be loaded
1506 // and stored atomically.
1507 //
1508 // Side Effects:
1509 //   disjoint_short_copy_entry is set to the no-overlap entry point
1510 //   used by generate_conjoint_short_copy().
1511 //
1512 address StubGenerator::generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1513 #if COMPILER2_OR_JVMCI
1514   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1515      return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
1516                                                aligned, false, false);
1517   }
1518 #endif
1519 
1520   __ align(CodeEntryAlignment);
1521   StubCodeMark mark(this, "StubRoutines", name);
1522   address start = __ pc();
1523   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1524 
1525   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1526   const Register from        = rdi;  // source array address
1527   const Register to          = rsi;  // destination array address
1528   const Register count       = rdx;  // elements count
1529   const Register word_count  = rcx;
1530   const Register qword_count = count;
1531   const Register end_from    = from; // source array end address
1532   const Register end_to      = to;   // destination array end address
1533   // End pointers are inclusive, and if count is not zero they point
1534   // to the last unit copied:  end_to[0] := end_from[0]
1535 
1536   __ enter(); // required for proper stackwalking of RuntimeStub frame
1537   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1538 
1539   if (entry != nullptr) {
1540     *entry = __ pc();
1541     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1542     BLOCK_COMMENT("Entry:");
1543   }
1544 
1545   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1546                     // r9 and r10 may be used to save non-volatile registers
1547 
1548   {
1549     // UnsafeCopyMemory page error: continue after ucm
1550     UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1551     // 'from', 'to' and 'count' are now valid
1552     __ movptr(word_count, count);
1553     __ shrptr(count, 2); // count => qword_count
1554 
1555     // Copy from low to high addresses.  Use 'to' as scratch.
1556     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1557     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1558     __ negptr(qword_count);
1559     __ jmp(L_copy_bytes);
1560 
1561     // Copy trailing qwords
1562   __ BIND(L_copy_8_bytes);
1563     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1564     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1565     __ increment(qword_count);
1566     __ jcc(Assembler::notZero, L_copy_8_bytes);
1567 
1568     // Original 'dest' is trashed, so we can't use it as a
1569     // base register for a possible trailing word copy
1570 
1571     // Check for and copy trailing dword
1572   __ BIND(L_copy_4_bytes);
1573     __ testl(word_count, 2);
1574     __ jccb(Assembler::zero, L_copy_2_bytes);
1575     __ movl(rax, Address(end_from, 8));
1576     __ movl(Address(end_to, 8), rax);
1577 
1578     __ addptr(end_from, 4);
1579     __ addptr(end_to, 4);
1580 
1581     // Check for and copy trailing word
1582   __ BIND(L_copy_2_bytes);
1583     __ testl(word_count, 1);
1584     __ jccb(Assembler::zero, L_exit);
1585     __ movw(rax, Address(end_from, 8));
1586     __ movw(Address(end_to, 8), rax);
1587   }
1588 __ BIND(L_exit);
1589   address ucme_exit_pc = __ pc();
1590   restore_arg_regs();
1591   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1592   __ xorptr(rax, rax); // return 0
1593   __ vzeroupper();
1594   __ leave(); // required for proper stackwalking of RuntimeStub frame
1595   __ ret(0);
1596 
1597   {
1598     UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1599     // Copy in multi-byte chunks
1600     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1601     __ jmp(L_copy_4_bytes);
1602   }
1603 
1604   return start;
1605 }
1606 
1607 
1608 address StubGenerator::generate_fill(BasicType t, bool aligned, const char *name) {
1609   __ align(CodeEntryAlignment);
1610   StubCodeMark mark(this, "StubRoutines", name);
1611   address start = __ pc();
1612 
1613   BLOCK_COMMENT("Entry:");
1614 
1615   const Register to       = c_rarg0;  // destination array address
1616   const Register value    = c_rarg1;  // value
1617   const Register count    = c_rarg2;  // elements count
1618   __ mov(r11, count);
1619 
1620   __ enter(); // required for proper stackwalking of RuntimeStub frame
1621 
1622   __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1623 
1624   __ vzeroupper();
1625   __ leave(); // required for proper stackwalking of RuntimeStub frame
1626   __ ret(0);
1627 
1628   return start;
1629 }
1630 
1631 
1632 // Arguments:
1633 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1634 //             ignored
1635 //   name    - stub name string
1636 //
1637 // Inputs:
1638 //   c_rarg0   - source array address
1639 //   c_rarg1   - destination array address
1640 //   c_rarg2   - element count, treated as ssize_t, can be zero
1641 //
1642 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1643 // let the hardware handle it.  The two or four words within dwords
1644 // or qwords that span cache line boundaries will still be loaded
1645 // and stored atomically.
1646 //
1647 address StubGenerator::generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1648                                                     address *entry, const char *name) {
1649 #if COMPILER2_OR_JVMCI
1650   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1651      return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
1652                                                nooverlap_target, aligned, false, false);
1653   }
1654 #endif
1655   __ align(CodeEntryAlignment);
1656   StubCodeMark mark(this, "StubRoutines", name);
1657   address start = __ pc();
1658   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1659 
1660   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1661   const Register from        = rdi;  // source array address
1662   const Register to          = rsi;  // destination array address
1663   const Register count       = rdx;  // elements count
1664   const Register word_count  = rcx;
1665   const Register qword_count = count;
1666 
1667   __ enter(); // required for proper stackwalking of RuntimeStub frame
1668   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1669 
1670   if (entry != nullptr) {
1671     *entry = __ pc();
1672     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1673     BLOCK_COMMENT("Entry:");
1674   }
1675 
1676   array_overlap_test(nooverlap_target, Address::times_2);
1677   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1678                     // r9 and r10 may be used to save non-volatile registers
1679 
1680   {
1681     // UnsafeCopyMemory page error: continue after ucm
1682     UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1683     // 'from', 'to' and 'count' are now valid
1684     __ movptr(word_count, count);
1685     __ shrptr(count, 2); // count => qword_count
1686 
1687     // Copy from high to low addresses.  Use 'to' as scratch.
1688 
1689     // Check for and copy trailing word
1690     __ testl(word_count, 1);
1691     __ jccb(Assembler::zero, L_copy_4_bytes);
1692     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1693     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1694 
1695     // Check for and copy trailing dword
1696   __ BIND(L_copy_4_bytes);
1697     __ testl(word_count, 2);
1698     __ jcc(Assembler::zero, L_copy_bytes);
1699     __ movl(rax, Address(from, qword_count, Address::times_8));
1700     __ movl(Address(to, qword_count, Address::times_8), rax);
1701     __ jmp(L_copy_bytes);
1702 
1703     // Copy trailing qwords
1704   __ BIND(L_copy_8_bytes);
1705     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1706     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1707     __ decrement(qword_count);
1708     __ jcc(Assembler::notZero, L_copy_8_bytes);
1709   }
1710   restore_arg_regs();
1711   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1712   __ xorptr(rax, rax); // return 0
1713   __ vzeroupper();
1714   __ leave(); // required for proper stackwalking of RuntimeStub frame
1715   __ ret(0);
1716 
1717   {
1718     // UnsafeCopyMemory page error: continue after ucm
1719     UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1720     // Copy in multi-byte chunks
1721     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1722   }
1723   restore_arg_regs();
1724   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1725   __ xorptr(rax, rax); // return 0
1726   __ vzeroupper();
1727   __ leave(); // required for proper stackwalking of RuntimeStub frame
1728   __ ret(0);
1729 
1730   return start;
1731 }
1732 
1733 
1734 // Arguments:
1735 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1736 //             ignored
1737 //   is_oop  - true => oop array, so generate store check code
1738 //   name    - stub name string
1739 //
1740 // Inputs:
1741 //   c_rarg0   - source array address
1742 //   c_rarg1   - destination array address
1743 //   c_rarg2   - element count, treated as ssize_t, can be zero
1744 //
1745 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1746 // the hardware handle it.  The two dwords within qwords that span
1747 // cache line boundaries will still be loaded and stored atomically.
1748 //
1749 // Side Effects:
1750 //   disjoint_int_copy_entry is set to the no-overlap entry point
1751 //   used by generate_conjoint_int_oop_copy().
1752 //
1753 address StubGenerator::generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1754                                                       const char *name, bool dest_uninitialized) {
1755   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1756 #if COMPILER2_OR_JVMCI
1757   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1758      return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
1759                                                aligned, is_oop, dest_uninitialized);
1760   }
1761 #endif
1762 
1763   __ align(CodeEntryAlignment);
1764   StubCodeMark mark(this, "StubRoutines", name);
1765   address start = __ pc();
1766 
1767   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1768   const Register from        = rdi;  // source array address
1769   const Register to          = rsi;  // destination array address
1770   const Register count       = rdx;  // elements count
1771   const Register dword_count = rcx;
1772   const Register qword_count = count;
1773   const Register end_from    = from; // source array end address
1774   const Register end_to      = to;   // destination array end address
1775   // End pointers are inclusive, and if count is not zero they point
1776   // to the last unit copied:  end_to[0] := end_from[0]
1777 
1778   __ enter(); // required for proper stackwalking of RuntimeStub frame
1779   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1780 
1781   if (entry != nullptr) {
1782     *entry = __ pc();
1783     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1784     BLOCK_COMMENT("Entry:");
1785   }
1786 
1787   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1788                                  // r9 is used to save r15_thread
1789 
1790   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1791   if (dest_uninitialized) {
1792     decorators |= IS_DEST_UNINITIALIZED;
1793   }
1794   if (aligned) {
1795     decorators |= ARRAYCOPY_ALIGNED;
1796   }
1797 
1798   BasicType type = is_oop ? T_OBJECT : T_INT;
1799   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1800 
1801   {
1802     // UnsafeCopyMemory page error: continue after ucm
1803     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1804     // 'from', 'to' and 'count' are now valid
1805     __ movptr(dword_count, count);
1806     __ shrptr(count, 1); // count => qword_count
1807 
1808     // Copy from low to high addresses.  Use 'to' as scratch.
1809     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1810     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1811     __ negptr(qword_count);
1812     __ jmp(L_copy_bytes);
1813 
1814     // Copy trailing qwords
1815   __ BIND(L_copy_8_bytes);
1816     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1817     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1818     __ increment(qword_count);
1819     __ jcc(Assembler::notZero, L_copy_8_bytes);
1820 
1821     // Check for and copy trailing dword
1822   __ BIND(L_copy_4_bytes);
1823     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1824     __ jccb(Assembler::zero, L_exit);
1825     __ movl(rax, Address(end_from, 8));
1826     __ movl(Address(end_to, 8), rax);
1827   }
1828 __ BIND(L_exit);
1829   address ucme_exit_pc = __ pc();
1830   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1831   restore_arg_regs_using_thread();
1832   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1833   __ vzeroupper();
1834   __ xorptr(rax, rax); // return 0
1835   __ leave(); // required for proper stackwalking of RuntimeStub frame
1836   __ ret(0);
1837 
1838   {
1839     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
1840     // Copy in multi-byte chunks
1841     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
1842     __ jmp(L_copy_4_bytes);
1843   }
1844 
1845   return start;
1846 }
1847 
1848 
1849 // Arguments:
1850 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1851 //             ignored
1852 //   is_oop  - true => oop array, so generate store check code
1853 //   name    - stub name string
1854 //
1855 // Inputs:
1856 //   c_rarg0   - source array address
1857 //   c_rarg1   - destination array address
1858 //   c_rarg2   - element count, treated as ssize_t, can be zero
1859 //
1860 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1861 // the hardware handle it.  The two dwords within qwords that span
1862 // cache line boundaries will still be loaded and stored atomically.
1863 //
1864 address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1865                                                       address *entry, const char *name,
1866                                                       bool dest_uninitialized) {
1867   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1868 #if COMPILER2_OR_JVMCI
1869   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1870      return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
1871                                                nooverlap_target, aligned, is_oop, dest_uninitialized);
1872   }
1873 #endif
1874   __ align(CodeEntryAlignment);
1875   StubCodeMark mark(this, "StubRoutines", name);
1876   address start = __ pc();
1877 
1878   Label L_copy_bytes, L_copy_8_bytes, L_exit;
1879   const Register from        = rdi;  // source array address
1880   const Register to          = rsi;  // destination array address
1881   const Register count       = rdx;  // elements count
1882   const Register dword_count = rcx;
1883   const Register qword_count = count;
1884 
1885   __ enter(); // required for proper stackwalking of RuntimeStub frame
1886   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1887 
1888   if (entry != nullptr) {
1889     *entry = __ pc();
1890      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1891     BLOCK_COMMENT("Entry:");
1892   }
1893 
1894   array_overlap_test(nooverlap_target, Address::times_4);
1895   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1896                                  // r9 is used to save r15_thread
1897 
1898   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1899   if (dest_uninitialized) {
1900     decorators |= IS_DEST_UNINITIALIZED;
1901   }
1902   if (aligned) {
1903     decorators |= ARRAYCOPY_ALIGNED;
1904   }
1905 
1906   BasicType type = is_oop ? T_OBJECT : T_INT;
1907   // no registers are destroyed by this call
1908   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1909 
1910   assert_clean_int(count, rax); // Make sure 'count' is clean int.
1911   {
1912     // UnsafeCopyMemory page error: continue after ucm
1913     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1914     // 'from', 'to' and 'count' are now valid
1915     __ movptr(dword_count, count);
1916     __ shrptr(count, 1); // count => qword_count
1917 
1918     // Copy from high to low addresses.  Use 'to' as scratch.
1919 
1920     // Check for and copy trailing dword
1921     __ testl(dword_count, 1);
1922     __ jcc(Assembler::zero, L_copy_bytes);
1923     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
1924     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
1925     __ jmp(L_copy_bytes);
1926 
1927     // Copy trailing qwords
1928   __ BIND(L_copy_8_bytes);
1929     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1930     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1931     __ decrement(qword_count);
1932     __ jcc(Assembler::notZero, L_copy_8_bytes);
1933   }
1934   if (is_oop) {
1935     __ jmp(L_exit);
1936   }
1937   restore_arg_regs_using_thread();
1938   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1939   __ xorptr(rax, rax); // return 0
1940   __ vzeroupper();
1941   __ leave(); // required for proper stackwalking of RuntimeStub frame
1942   __ ret(0);
1943 
1944   {
1945     // UnsafeCopyMemory page error: continue after ucm
1946     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1947     // Copy in multi-byte chunks
1948     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
1949   }
1950 
1951 __ BIND(L_exit);
1952   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1953   restore_arg_regs_using_thread();
1954   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1955   __ xorptr(rax, rax); // return 0
1956   __ vzeroupper();
1957   __ leave(); // required for proper stackwalking of RuntimeStub frame
1958   __ ret(0);
1959 
1960   return start;
1961 }
1962 
1963 
1964 // Arguments:
1965 //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1966 //             ignored
1967 //   is_oop  - true => oop array, so generate store check code
1968 //   name    - stub name string
1969 //
1970 // Inputs:
1971 //   c_rarg0   - source array address
1972 //   c_rarg1   - destination array address
1973 //   c_rarg2   - element count, treated as ssize_t, can be zero
1974 //
1975 // Side Effects:
1976 //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1977 //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1978 //
1979 address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
1980                                                        const char *name, bool dest_uninitialized) {
1981   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1982 #if COMPILER2_OR_JVMCI
1983   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1984      return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
1985                                                aligned, is_oop, dest_uninitialized);
1986   }
1987 #endif
1988   __ align(CodeEntryAlignment);
1989   StubCodeMark mark(this, "StubRoutines", name);
1990   address start = __ pc();
1991 
1992   Label L_copy_bytes, L_copy_8_bytes, L_exit;
1993   const Register from        = rdi;  // source array address
1994   const Register to          = rsi;  // destination array address
1995   const Register qword_count = rdx;  // elements count
1996   const Register end_from    = from; // source array end address
1997   const Register end_to      = rcx;  // destination array end address
1998   const Register saved_count = r11;
1999   // End pointers are inclusive, and if count is not zero they point
2000   // to the last unit copied:  end_to[0] := end_from[0]
2001 
2002   __ enter(); // required for proper stackwalking of RuntimeStub frame
2003   // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2004   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2005 
2006   if (entry != nullptr) {
2007     *entry = __ pc();
2008     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2009     BLOCK_COMMENT("Entry:");
2010   }
2011 
2012   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2013                                    // r9 is used to save r15_thread
2014   // 'from', 'to' and 'qword_count' are now valid
2015 
2016   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2017   if (dest_uninitialized) {
2018     decorators |= IS_DEST_UNINITIALIZED;
2019   }
2020   if (aligned) {
2021     decorators |= ARRAYCOPY_ALIGNED;
2022   }
2023 
2024   BasicType type = is_oop ? T_OBJECT : T_LONG;
2025   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2026   {
2027     // UnsafeCopyMemory page error: continue after ucm
2028     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2029 
2030     // Copy from low to high addresses.  Use 'to' as scratch.
2031     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2032     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2033     __ negptr(qword_count);
2034     __ jmp(L_copy_bytes);
2035 
2036     // Copy trailing qwords
2037   __ BIND(L_copy_8_bytes);
2038     bs->copy_load_at(_masm, decorators, type, 8,
2039                      rax, Address(end_from, qword_count, Address::times_8, 8),
2040                      r10);
2041     bs->copy_store_at(_masm, decorators, type, 8,
2042                       Address(end_to, qword_count, Address::times_8, 8), rax,
2043                       r10);
2044     __ increment(qword_count);
2045     __ jcc(Assembler::notZero, L_copy_8_bytes);
2046   }
2047   if (is_oop) {
2048     __ jmp(L_exit);
2049   } else {
2050     restore_arg_regs_using_thread();
2051     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2052     __ xorptr(rax, rax); // return 0
2053     __ vzeroupper();
2054     __ leave(); // required for proper stackwalking of RuntimeStub frame
2055     __ ret(0);
2056   }
2057 
2058   {
2059     // UnsafeCopyMemory page error: continue after ucm
2060     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2061     // Copy in multi-byte chunks
2062     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2063   }
2064 
2065   __ BIND(L_exit);
2066   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2067   restore_arg_regs_using_thread();
2068   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2069                           SharedRuntime::_jlong_array_copy_ctr,
2070                  rscratch1); // Update counter after rscratch1 is free
2071   __ vzeroupper();
2072   __ xorptr(rax, rax); // return 0
2073   __ leave(); // required for proper stackwalking of RuntimeStub frame
2074   __ ret(0);
2075 
2076   return start;
2077 }
2078 
2079 
2080 // Arguments:
2081 //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2082 //             ignored
2083 //   is_oop  - true => oop array, so generate store check code
2084 //   name    - stub name string
2085 //
2086 // Inputs:
2087 //   c_rarg0   - source array address
2088 //   c_rarg1   - destination array address
2089 //   c_rarg2   - element count, treated as ssize_t, can be zero
2090 //
2091 address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2092                                                        address *entry, const char *name,
2093                                                        bool dest_uninitialized) {
2094   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2095 #if COMPILER2_OR_JVMCI
2096   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2097      return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
2098                                                nooverlap_target, aligned, is_oop, dest_uninitialized);
2099   }
2100 #endif
2101   __ align(CodeEntryAlignment);
2102   StubCodeMark mark(this, "StubRoutines", name);
2103   address start = __ pc();
2104 
2105   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2106   const Register from        = rdi;  // source array address
2107   const Register to          = rsi;  // destination array address
2108   const Register qword_count = rdx;  // elements count
2109   const Register saved_count = rcx;
2110 
2111   __ enter(); // required for proper stackwalking of RuntimeStub frame
2112   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2113 
2114   if (entry != nullptr) {
2115     *entry = __ pc();
2116     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2117     BLOCK_COMMENT("Entry:");
2118   }
2119 
2120   array_overlap_test(nooverlap_target, Address::times_8);
2121   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2122                                  // r9 is used to save r15_thread
2123   // 'from', 'to' and 'qword_count' are now valid
2124 
2125   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2126   if (dest_uninitialized) {
2127     decorators |= IS_DEST_UNINITIALIZED;
2128   }
2129   if (aligned) {
2130     decorators |= ARRAYCOPY_ALIGNED;
2131   }
2132 
2133   BasicType type = is_oop ? T_OBJECT : T_LONG;
2134   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2135   {
2136     // UnsafeCopyMemory page error: continue after ucm
2137     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2138 
2139     __ jmp(L_copy_bytes);
2140 
2141     // Copy trailing qwords
2142   __ BIND(L_copy_8_bytes);
2143     bs->copy_load_at(_masm, decorators, type, 8,
2144                      rax, Address(from, qword_count, Address::times_8, -8),
2145                      r10);
2146     bs->copy_store_at(_masm, decorators, type, 8,
2147                       Address(to, qword_count, Address::times_8, -8), rax,
2148                       r10);
2149     __ decrement(qword_count);
2150     __ jcc(Assembler::notZero, L_copy_8_bytes);
2151   }
2152   if (is_oop) {
2153     __ jmp(L_exit);
2154   } else {
2155     restore_arg_regs_using_thread();
2156     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2157     __ xorptr(rax, rax); // return 0
2158     __ vzeroupper();
2159     __ leave(); // required for proper stackwalking of RuntimeStub frame
2160     __ ret(0);
2161   }
2162   {
2163     // UnsafeCopyMemory page error: continue after ucm
2164     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2165 
2166     // Copy in multi-byte chunks
2167     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2168   }
2169   __ BIND(L_exit);
2170   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2171   restore_arg_regs_using_thread();
2172   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2173                           SharedRuntime::_jlong_array_copy_ctr,
2174                  rscratch1); // Update counter after rscratch1 is free
2175   __ vzeroupper();
2176   __ xorptr(rax, rax); // return 0
2177   __ leave(); // required for proper stackwalking of RuntimeStub frame
2178   __ ret(0);
2179 
2180   return start;
2181 }
2182 
2183 
2184 // Helper for generating a dynamic type check.
2185 // Smashes no registers.
2186 void StubGenerator::generate_type_check(Register sub_klass,
2187                                         Register super_check_offset,
2188                                         Register super_klass,
2189                                         Label& L_success) {
2190   assert_different_registers(sub_klass, super_check_offset, super_klass);
2191 
2192   BLOCK_COMMENT("type_check:");
2193 
2194   Label L_miss;
2195 
2196   __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
2197                                    super_check_offset);
2198   __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
2199 
2200   // Fall through on failure!
2201   __ BIND(L_miss);
2202 }
2203 
2204 //
2205 //  Generate checkcasting array copy stub
2206 //
2207 //  Input:
2208 //    c_rarg0   - source array address
2209 //    c_rarg1   - destination array address
2210 //    c_rarg2   - element count, treated as ssize_t, can be zero
2211 //    c_rarg3   - size_t ckoff (super_check_offset)
2212 // not Win64
2213 //    c_rarg4   - oop ckval (super_klass)
2214 // Win64
2215 //    rsp+40    - oop ckval (super_klass)
2216 //
2217 //  Output:
2218 //    rax ==  0  -  success
2219 //    rax == -1^K - failure, where K is partial transfer count
2220 //
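     //  For example, if 3 of 10 elements are copied before a failing element is
     //  encountered, K == 3 and the stub returns rax == ~3 == -4; the caller
     //  recovers the partial transfer count as ~rax.
     //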
2221 address StubGenerator::generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized) {
2222 
2223   Label L_load_element, L_store_element, L_do_card_marks, L_done;
2224 
2225   // Input registers (after setup_arg_regs)
2226   const Register from        = rdi;   // source array address
2227   const Register to          = rsi;   // destination array address
2228   const Register length      = rdx;   // elements count
2229   const Register ckoff       = rcx;   // super_check_offset
2230   const Register ckval       = r8;    // super_klass
2231 
2232   // Registers used as temps (r13, r14 are save-on-entry)
2233   const Register end_from    = from;  // source array end address
2234   const Register end_to      = r13;   // destination array end address
2235   const Register count       = rdx;   // -(count_remaining)
2236   const Register r14_length  = r14;   // saved copy of length
2237   // End pointers are inclusive, and if length is not zero they point
2238   // to the last unit copied:  end_to[0] := end_from[0]
2239 
2240   const Register rax_oop    = rax;    // actual oop copied
2241   const Register r11_klass  = r11;    // oop._klass
2242 
2243   //---------------------------------------------------------------
2244   // Assembler stub will be used for this call to arraycopy
2245   // if the two arrays are subtypes of Object[] but the
2246   // destination array type is not equal to or a supertype
2247   // of the source type.  Each element must be separately
2248   // checked.
2249 
2250   __ align(CodeEntryAlignment);
2251   StubCodeMark mark(this, "StubRoutines", name);
2252   address start = __ pc();
2253 
2254   __ enter(); // required for proper stackwalking of RuntimeStub frame
2255 
2256 #ifdef ASSERT
2257   // caller guarantees that the arrays really are different
2258   // otherwise, we would have to make conjoint checks
2259   { Label L;
2260     array_overlap_test(L, TIMES_OOP);
2261     __ stop("checkcast_copy within a single array");
2262     __ bind(L);
2263   }
2264 #endif //ASSERT
2265 
2266   setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
2267                                   // ckoff => rcx, ckval => r8
2268                                   // r9 is used to save r15_thread
2269 #ifdef _WIN64
2270   // last argument (#4) is on stack on Win64
2271   __ movptr(ckval, Address(rsp, 6 * wordSize));
2272 #endif
2273 
2274   // Caller of this entry point must set up the argument registers.
2275   if (entry != nullptr) {
2276     *entry = __ pc();
2277     BLOCK_COMMENT("Entry:");
2278   }
2279 
2280   // allocate spill slots for r13, r14 and r10
2281   enum {
2282     saved_r13_offset,
2283     saved_r14_offset,
2284     saved_r10_offset,
2285     saved_rbp_offset
2286   };
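       // The enum above provides both the slot indices and the size of the spill
       // area: saved_rbp_offset == 3, so three words are reserved below for r13,
       // r14 and r10 (rbp itself was already pushed by enter()).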
2287   __ subptr(rsp, saved_rbp_offset * wordSize);
2288   __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2289   __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2290   __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2291 
2292 #ifdef ASSERT
2293     Label L2;
2294     __ get_thread(r14);
2295     __ cmpptr(r15_thread, r14);
2296     __ jcc(Assembler::equal, L2);
2297     __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2298     __ bind(L2);
2299 #endif // ASSERT
2300 
2301   // check that int operands are properly extended to size_t
2302   assert_clean_int(length, rax);
2303   assert_clean_int(ckoff, rax);
2304 
2305 #ifdef ASSERT
2306   BLOCK_COMMENT("assert consistent ckoff/ckval");
2307   // The ckoff and ckval must be mutually consistent,
2308   // even though caller generates both.
2309   { Label L;
2310     int sco_offset = in_bytes(Klass::super_check_offset_offset());
2311     __ cmpl(ckoff, Address(ckval, sco_offset));
2312     __ jcc(Assembler::equal, L);
2313     __ stop("super_check_offset inconsistent");
2314     __ bind(L);
2315   }
2316 #endif //ASSERT
2317 
2318   // Loop-invariant addresses.  They are exclusive end pointers.
2319   Address end_from_addr(from, length, TIMES_OOP, 0);
2320   Address   end_to_addr(to,   length, TIMES_OOP, 0);
2321   // Loop-variant addresses.  They assume post-incremented count < 0.
2322   Address from_element_addr(end_from, count, TIMES_OOP, 0);
2323   Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2324 
2325   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2326   if (dest_uninitialized) {
2327     decorators |= IS_DEST_UNINITIALIZED;
2328   }
2329 
2330   BasicType type = T_OBJECT;
2331   size_t element_size = UseCompressedOops ? 4 : 8;
2332 
2333   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2334   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2335 
2336   // Copy from low to high addresses, indexed from the end of each array.
2337   __ lea(end_from, end_from_addr);
2338   __ lea(end_to,   end_to_addr);
2339   __ movptr(r14_length, length);        // save a copy of the length
2340   assert(length == count, "");          // else fix next line:
2341   __ negptr(count);                     // negate and test the length
2342   __ jcc(Assembler::notZero, L_load_element);
2343 
2344   // Empty array:  Nothing to do.
2345   __ xorptr(rax, rax);                  // return 0 on (trivial) success
2346   __ jmp(L_done);
2347 
2348   // ======== begin loop ========
2349   // (Loop is rotated; its entry is L_load_element.)
2350   // Loop control:
2351   //   for (count = -count; count != 0; count++)
2352   // Base pointers src, dst are biased by 8*(count-1), to the last element.
2353   __ align(OptoLoopAlignment);
2354 
2355   __ BIND(L_store_element);
2356   bs->copy_store_at(_masm,
2357                     decorators,
2358                     type,
2359                     element_size,
2360                     to_element_addr,
2361                     rax_oop,
2362                     r10);
2363   __ increment(count);               // increment the count toward zero
2364   __ jcc(Assembler::zero, L_do_card_marks);
2365 
2366   // ======== loop entry is here ========
2367   __ BIND(L_load_element);
2368   bs->copy_load_at(_masm,
2369                    decorators,
2370                    type,
2371                    element_size,
2372                    rax_oop,
2373                    from_element_addr,
2374                    r10);
2375   __ testptr(rax_oop, rax_oop);
2376   __ jcc(Assembler::zero, L_store_element);
2377 
2378   __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2379   generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2380   // ======== end loop ========
2381 
2382   // It was a real error; we must depend on the caller to finish the job.
2383   // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2384   // Emit GC store barriers for the oops we have copied (r14 + rdx),
2385   // and report their number to the caller.
2386   assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2387   Label L_post_barrier;
2388   __ addptr(r14_length, count);     // K = (original - remaining) oops
2389   __ movptr(rax, r14_length);       // save the value
2390   __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2391   __ jccb(Assembler::notZero, L_post_barrier);
2392   __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2393 
2394   // Come here on success only.
2395   __ BIND(L_do_card_marks);
2396   __ xorptr(rax, rax);              // return 0 on success
2397 
2398   __ BIND(L_post_barrier);
2399   bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2400 
2401   // Common exit point (success or failure).
2402   __ BIND(L_done);
2403   __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2404   __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2405   __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2406   restore_arg_regs_using_thread();
2407   INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2408   __ leave(); // required for proper stackwalking of RuntimeStub frame
2409   __ ret(0);
2410 
2411   return start;
2412 }
2413 
2414 
2415 //  Generate 'unsafe' array copy stub
2416 //  Though just as safe as the other stubs, it takes an unscaled
2417 //  size_t argument instead of an element count.
2418 //
2419 //  Input:
2420 //    c_rarg0   - source array address
2421 //    c_rarg1   - destination array address
2422 //    c_rarg2   - byte count, treated as ssize_t, can be zero
2423 //
2424 // Examines the alignment of the operands and dispatches
2425 // to a long, int, short, or byte copy loop.
2426 //
2427 address StubGenerator::generate_unsafe_copy(const char *name,
2428                                             address byte_copy_entry, address short_copy_entry,
2429                                             address int_copy_entry, address long_copy_entry) {
2430 
2431   Label L_long_aligned, L_int_aligned, L_short_aligned;
2432 
2433   // Input registers (before setup_arg_regs)
2434   const Register from        = c_rarg0;  // source array address
2435   const Register to          = c_rarg1;  // destination array address
2436   const Register size        = c_rarg2;  // byte count (size_t)
2437 
2438   // Register used as a temp
2439   const Register bits        = rax;      // test copy of low bits
2440 
2441   __ align(CodeEntryAlignment);
2442   StubCodeMark mark(this, "StubRoutines", name);
2443   address start = __ pc();
2444 
2445   __ enter(); // required for proper stackwalking of RuntimeStub frame
2446 
2447   // bump this on entry, not on exit:
2448   INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
2449 
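       // The widest safe copy granularity is given by the lowest set bit of
       // (from | to | size): source, destination and byte count must all be
       // N-byte aligned for an N-byte element copy, so OR-ing them lets a
       // single testb per size class decide the dispatch below.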
2450   __ mov(bits, from);
2451   __ orptr(bits, to);
2452   __ orptr(bits, size);
2453 
2454   __ testb(bits, BytesPerLong-1);
2455   __ jccb(Assembler::zero, L_long_aligned);
2456 
2457   __ testb(bits, BytesPerInt-1);
2458   __ jccb(Assembler::zero, L_int_aligned);
2459 
2460   __ testb(bits, BytesPerShort-1);
2461   __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2462 
2463   __ BIND(L_short_aligned);
2464   __ shrptr(size, LogBytesPerShort); // size => short_count
2465   __ jump(RuntimeAddress(short_copy_entry));
2466 
2467   __ BIND(L_int_aligned);
2468   __ shrptr(size, LogBytesPerInt); // size => int_count
2469   __ jump(RuntimeAddress(int_copy_entry));
2470 
2471   __ BIND(L_long_aligned);
2472   __ shrptr(size, LogBytesPerLong); // size => qword_count
2473   __ jump(RuntimeAddress(long_copy_entry));
2474 
2475   return start;
2476 }
2477 
2478 
2479 // Perform range checks on the proposed arraycopy.
2480 // Kills temp, but nothing else.
2481 // Also, clean the sign bits of src_pos and dst_pos.
2482 void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2483                                            Register src_pos, // source position (c_rarg1)
2484                                            Register dst,     // destination array oop (c_rarg2)
                                           Register dst_pos, // destination position (c_rarg3)
                                           Register length,
                                           Register temp,
                                           Label& L_failed) {
  BLOCK_COMMENT("arraycopy_range_checks:");

  //  if (src_pos + length > arrayOop(src)->length())  FAIL;
  __ movl(temp, length);
  __ addl(temp, src_pos);             // src_pos + length
  __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed);

  //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
  __ movl(temp, length);
  __ addl(temp, dst_pos);             // dst_pos + length
  __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed);

  // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  // Move with sign extension can be used since they are positive.
  __ movslq(src_pos, src_pos);
  __ movslq(dst_pos, dst_pos);

  BLOCK_COMMENT("arraycopy_range_checks done");
}


//  Generate generic array copy stubs
//
//  Input:
//    c_rarg0    -  src oop
//    c_rarg1    -  src_pos (32-bits)
//    c_rarg2    -  dst oop
//    c_rarg3    -  dst_pos (32-bits)
// not Win64
//    c_rarg4    -  element count (32-bits)
// Win64
//    rsp+40     -  element count (32-bits)
//
//  Output:
//    rax ==  0  -  success
//    rax == -1^K - failure, where K is partial transfer count
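//    A failure return thus encodes how much work was already done: the
//    number of elements copied before the failure is K == ~rax, so
//    rax == -1 means nothing was copied.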
//
address StubGenerator::generate_generic_copy(const char *name,
                                             address byte_copy_entry, address short_copy_entry,
                                             address int_copy_entry, address oop_copy_entry,
                                             address long_copy_entry, address checkcast_copy_entry) {

  Label L_failed, L_failed_0, L_objArray;
  Label L_copy_shorts, L_copy_ints, L_copy_longs;

  // Input registers
  const Register src        = c_rarg0;  // source array oop
  const Register src_pos    = c_rarg1;  // source position
  const Register dst        = c_rarg2;  // destination array oop
  const Register dst_pos    = c_rarg3;  // destination position
#ifndef _WIN64
  const Register length     = c_rarg4;
  const Register rklass_tmp = r9;  // load_klass
#else
  const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
  const Register rklass_tmp = rdi;  // load_klass
#endif

  { int modulus = CodeEntryAlignment;
    int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
    int advance = target - (__ offset() % modulus);
    if (advance < 0)  advance += modulus;
    if (advance > 0)  __ nop(advance);
  }
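  // The padding above positions the 5-byte jmp(L_failed) emitted below so
  // that it ends exactly on a CodeEntryAlignment boundary; the stub entry
  // that follows therefore needs no further padding (see the assert below).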
  StubCodeMark mark(this, "StubRoutines", name);

  // Short-hop target to L_failed.  Makes for denser prologue code.
  __ BIND(L_failed_0);
  __ jmp(L_failed);
  assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

  __ align(CodeEntryAlignment);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
  __ push(rklass_tmp); // rdi is callee-save on Windows
#endif

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);

  //-----------------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the following conditions are met:
  //
  // (1) src and dst must not be null.
  // (2) src_pos must not be negative.
  // (3) dst_pos must not be negative.
  // (4) length  must not be negative.
  // (5) src klass and dst klass should be the same and not null.
  // (6) src and dst should be arrays.
  // (7) src_pos + length must not exceed length of src.
  // (8) dst_pos + length must not exceed length of dst.
  //
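  // In Java terms these are the preconditions under which System.arraycopy
  // can be completed entirely by this stub; if any of them fails, the stub
  // returns the failure value (-1^K) and the caller is expected to fall
  // back to the slower runtime path.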

  //  if (src == nullptr) return -1;
  __ testptr(src, src);         // src oop
  size_t j1off = __ offset();
  __ jccb(Assembler::zero, L_failed_0);

  //  if (src_pos < 0) return -1;
  __ testl(src_pos, src_pos); // src_pos (32-bits)
  __ jccb(Assembler::negative, L_failed_0);

  //  if (dst == nullptr) return -1;
  __ testptr(dst, dst);         // dst oop
  __ jccb(Assembler::zero, L_failed_0);

  //  if (dst_pos < 0) return -1;
  __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  size_t j4off = __ offset();
  __ jccb(Assembler::negative, L_failed_0);

  // The first four tests are very dense code,
  // but not quite dense enough to put four
  // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predictors
  // do not like jumps so close together.
  // Make sure of this.
  guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
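  // The expression above masks off the low four bits before comparing, so it
  // is non-zero exactly when the first and fourth jumps start in different
  // 16-byte blocks, i.e. in different instruction fetch lines.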

  // registers used as temp
  const Register r11_length    = r11; // elements count to copy
  const Register r10_src_klass = r10; // array klass

  //  if (length < 0) return -1;
  __ movl(r11_length, length);        // length (elements count, 32-bits value)
  __ testl(r11_length, r11_length);
  __ jccb(Assembler::negative, L_failed_0);

  __ load_klass(r10_src_klass, src, rklass_tmp);
#ifdef ASSERT
  //  assert(src->klass() != nullptr);
  {
    BLOCK_COMMENT("assert klasses not null {");
    Label L1, L2;
    __ testptr(r10_src_klass, r10_src_klass);
    __ jcc(Assembler::notZero, L2);   // it is broken if klass is null
    __ bind(L1);
    __ stop("broken null klass");
    __ bind(L2);
    __ load_klass(rax, dst, rklass_tmp);
    __ cmpq(rax, 0);
    __ jcc(Assembler::equal, L1);     // this would be broken also
    BLOCK_COMMENT("} assert klasses not null done");
  }
#endif

  // Load layout helper (32-bits)
  //
  //  |array_tag|     | header_size | element_type |     |log2_element_size|
  // 32        30    24            16              8     2                 0
  //
  //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  //
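  //   For example, an int[] (a typeArray) decodes as array_tag == 0x3,
  //   element_type == T_INT and log2_element_size == 2; the exact
  //   header_size bits depend on the object header layout.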

  const int lh_offset = in_bytes(Klass::layout_helper_offset());

  // Handle objArrays completely differently...
  const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
  __ jcc(Assembler::equal, L_objArray);

  //  if (src->klass() != dst->klass()) return -1;
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax);
  __ jcc(Assembler::notEqual, L_failed);

  const Register rax_lh = rax;  // layout helper
  __ movl(rax_lh, Address(r10_src_klass, lh_offset));

  //  if (!src->is_Array()) return -1;
  __ cmpl(rax_lh, Klass::_lh_neutral_value);
  __ jcc(Assembler::greaterEqual, L_failed);

  // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert primitive array {");
    Label L;
    __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
    __ jcc(Assembler::greaterEqual, L);
    __ stop("must be a primitive array");
    __ bind(L);
    BLOCK_COMMENT("} assert primitive array done");
  }
#endif

  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  // TypeArrayKlass
  //
  // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  //
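  // e.g. for a short[] copy (log2elemsize == 1) starting at src_pos == 5,
  // src_addr ends up 10 bytes past the first element of the source array.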

  const Register r10_offset = r10;    // array offset
  const Register rax_elsize = rax_lh; // element size

  __ movl(r10_offset, rax_lh);
  __ shrl(r10_offset, Klass::_lh_header_size_shift);
  __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
  __ addptr(src, r10_offset);           // src array offset
  __ addptr(dst, r10_offset);           // dst array offset
  BLOCK_COMMENT("choose copy loop based on element size");
  __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
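  // rax_elsize now holds log2(element size): 0, 1, 2 or 3.  The compare
  // chain below uses it to dispatch to the byte, short, int or long copy stub.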

#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif

  // The following registers must be set before the jump to the corresponding copy stub.
  const Register from     = c_rarg0;  // source array address
  const Register to       = c_rarg1;  // destination array address
  const Register count    = c_rarg2;  // elements count

  // 'from', 'to' and 'count' must be written in exactly this order: they
  // alias 'src', 'src_pos' and 'dst', so each lea below has to read its
  // inputs before a later assignment overwrites them.

  __ cmpl(rax_elsize, 0);
  __ jccb(Assembler::notEqual, L_copy_shorts);
  __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(byte_copy_entry));

__ BIND(L_copy_shorts);
  __ cmpl(rax_elsize, LogBytesPerShort);
  __ jccb(Assembler::notEqual, L_copy_ints);
  __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(short_copy_entry));

__ BIND(L_copy_ints);
  __ cmpl(rax_elsize, LogBytesPerInt);
  __ jccb(Assembler::notEqual, L_copy_longs);
  __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(int_copy_entry));

__ BIND(L_copy_longs);
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert long copy {");
    Label L;
    __ cmpl(rax_elsize, LogBytesPerLong);
    __ jcc(Assembler::equal, L);
    __ stop("must be long copy, but elsize is wrong");
    __ bind(L);
    BLOCK_COMMENT("} assert long copy done");
  }
#endif
  __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(long_copy_entry));

  // ObjArrayKlass
__ BIND(L_objArray);
  // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]

  Label L_plain_copy, L_checkcast_copy;
  //  test array classes for subtyping
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax); // usual case is exact equality
  __ jcc(Assembler::notEqual, L_checkcast_copy);

  // Identically typed arrays can be copied without element-wise checks.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  __ lea(from, Address(src, src_pos, TIMES_OOP,
               arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
               arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  __ movl2ptr(count, r11_length); // length
__ BIND(L_plain_copy);
#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif
  __ jump(RuntimeAddress(oop_copy_entry));

__ BIND(L_checkcast_copy);
  // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
  {
    // Before looking at dst.length, make sure dst is also an objArray.
    __ cmpl(Address(rax, lh_offset), objArray_lh);
    __ jcc(Assembler::notEqual, L_failed);

    // It is safe to examine both src.length and dst.length.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           rax, L_failed);

    const Register r11_dst_klass = r11;
    __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload

    // Marshal the base address arguments now, freeing registers.
    __ lea(from, Address(src, src_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ movl(count, length);           // length (reloaded)
    Register sco_temp = c_rarg3;      // this register is free now
    assert_different_registers(from, to, count, sco_temp,
                               r11_dst_klass, r10_src_klass);
    assert_clean_int(count, sco_temp);

    // Generate the type check.
    const int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);
    generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
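    // generate_type_check branches to L_plain_copy when the source array
    // klass is a subtype of the destination array klass (every source
    // element is then trivially storable); otherwise it falls through to
    // set up the element-wise checkcast copy below.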

    // Fetch destination element klass from the ObjArrayKlass header.
    int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
    __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
    __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);

#ifdef _WIN64
    __ pop(rklass_tmp); // Restore callee-save rdi
#endif

    // the checkcast_copy loop needs two extra arguments:
    assert(c_rarg3 == sco_temp, "#3 already in place");
    // Set up arguments for checkcast_copy_entry.
    setup_arg_regs_using_thread(4);
    __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
    __ jump(RuntimeAddress(checkcast_copy_entry));
  }

__ BIND(L_failed);
#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif
  __ xorptr(rax, rax);
  __ notptr(rax); // return -1
  __ leave();   // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}

#undef __