1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/objArrayKlass.hpp"
  30 #include "runtime/sharedRuntime.hpp"
  31 #include "runtime/stubRoutines.hpp"
  32 #include "stubGenerator_x86_64.hpp"
  33 #ifdef COMPILER2
  34 #include "opto/c2_globals.hpp"
  35 #endif
  36 #if INCLUDE_JVMCI
  37 #include "jvmci/jvmci_globals.hpp"
  38 #endif
  39 
  40 #define __ _masm->
  41 
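// Address scale factor for indexing oop array elements: 4 bytes per element
// when compressed oops are in use, 8 bytes otherwise.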
  42 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #else
  47 #define BLOCK_COMMENT(str) __ block_comment(str)
  48 #endif // PRODUCT
  49 
  50 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  51 
  52 #ifdef PRODUCT
  53 #define INC_COUNTER_NP(counter, rscratch) ((void)0)
  54 #else
  55 #define INC_COUNTER_NP(counter, rscratch) \
  56 BLOCK_COMMENT("inc_counter " #counter); \
  57 inc_counter_np(_masm, counter, rscratch);
  58 
  59 static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  60   __ incrementl(ExternalAddress((address)&counter), rscratch);
  61 }
  62 
  63 #if COMPILER2_OR_JVMCI
  64 static uint& get_profile_ctr(int shift) {
  65   if (shift == 0) {
  66     return SharedRuntime::_jbyte_array_copy_ctr;
  67   } else if (shift == 1) {
  68     return SharedRuntime::_jshort_array_copy_ctr;
  69   } else if (shift == 2) {
  70     return SharedRuntime::_jint_array_copy_ctr;
  71   } else {
  72     assert(shift == 3, "");
  73     return SharedRuntime::_jlong_array_copy_ctr;
  74   }
  75 }
  76 #endif // COMPILER2_OR_JVMCI
  77 #endif // !PRODUCT
  78 
  79 void StubGenerator::generate_arraycopy_stubs() {
  80   address entry;
  81   address entry_jbyte_arraycopy;
  82   address entry_jshort_arraycopy;
  83   address entry_jint_arraycopy;
  84   address entry_oop_arraycopy;
  85   address entry_jlong_arraycopy;
  86   address entry_checkcast_arraycopy;
  87 
  88   StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
  89                                                                          "jbyte_disjoint_arraycopy");
  90   StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
  91                                                                          "jbyte_arraycopy");
  92 
  93   StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
  94                                                                           "jshort_disjoint_arraycopy");
  95   StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
  96                                                                           "jshort_arraycopy");
  97 
  98   StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
  99                                                                             "jint_disjoint_arraycopy");
 100   StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
 101                                                                             &entry_jint_arraycopy, "jint_arraycopy");
 102 
 103   StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
 104                                                                              "jlong_disjoint_arraycopy");
 105   StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
 106                                                                              &entry_jlong_arraycopy, "jlong_arraycopy");
 107   if (UseCompressedOops) {
 108     StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
 109                                                                             "oop_disjoint_arraycopy");
 110     StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
 111                                                                             &entry_oop_arraycopy, "oop_arraycopy");
 112     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
 113                                                                                    "oop_disjoint_arraycopy_uninit",
 114                                                                                    /*dest_uninitialized*/true);
 115     StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
 116                                                                                    nullptr, "oop_arraycopy_uninit",
 117                                                                                    /*dest_uninitialized*/true);
 118   } else {
 119     StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
 120                                                                              "oop_disjoint_arraycopy");
 121     StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
 122                                                                              &entry_oop_arraycopy, "oop_arraycopy");
 123     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
 124                                                                                     "oop_disjoint_arraycopy_uninit",
 125                                                                                     /*dest_uninitialized*/true);
 126     StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
 127                                                                                     nullptr, "oop_arraycopy_uninit",
 128                                                                                     /*dest_uninitialized*/true);
 129   }
 130 
 131   StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
 132   StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
 133                                                                       /*dest_uninitialized*/true);
 134 
 135   StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
 136                                                             entry_jbyte_arraycopy,
 137                                                             entry_jshort_arraycopy,
 138                                                             entry_jint_arraycopy,
 139                                                             entry_jlong_arraycopy);
 140   StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
 141                                                              entry_jbyte_arraycopy,
 142                                                              entry_jshort_arraycopy,
 143                                                              entry_jint_arraycopy,
 144                                                              entry_oop_arraycopy,
 145                                                              entry_jlong_arraycopy,
 146                                                              entry_checkcast_arraycopy);
 147 
 148   StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
 149   StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
 150   StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
 151   StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
 152   StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
 153   StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
 154 
 155   StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory("unsafe_setmemory", StubRoutines::_jbyte_fill);
 156 
 157   // We don't generate specialized code for HeapWord-aligned source
 158   // arrays, so just use the code we've already generated
 159   StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
 160   StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
 161 
 162   StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
 163   StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
 164 
 165   StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
 166   StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
 167 
 168   StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
 169   StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
 170 
 171   StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
 172   StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
 173 
 174   StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
 175   StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
 176 }
 177 
 178 
// Verify that a register contains a clean 32-bit positive value
// (the high 32 bits are 0) so it can be used in 64-bit shifts.
//
//  Input:
//    Rint  -  32-bit value
 184 //    Rtmp  -  scratch
 185 //
 186 void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
 187 #ifdef ASSERT
 188   Label L;
 189   assert_different_registers(Rtmp, Rint);
 190   __ movslq(Rtmp, Rint);
 191   __ cmpq(Rtmp, Rint);
 192   __ jcc(Assembler::equal, L);
 193   __ stop("high 32-bits of int value are not 0");
 194   __ bind(L);
 195 #endif
 196 }
 197 
 198 
 199 //  Generate overlap test for array copy stubs
 200 //
 201 //  Input:
 202 //     c_rarg0 - from
 203 //     c_rarg1 - to
 204 //     c_rarg2 - element count
 205 //
 206 //  Output:
//     rax   - &from[element count], i.e. one past the last source element
 208 //
 209 void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
 210   const Register from     = c_rarg0;
 211   const Register to       = c_rarg1;
 212   const Register count    = c_rarg2;
 213   const Register end_from = rax;
 214 
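  // There is no overlap that requires a backward copy when 'to' is at or
  // below 'from', or when 'to' starts at or beyond the exclusive end of the
  // source range, &from[count] (computed into end_from below); in that case
  // branch to the no-overlap (forward copy) target.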
 215   __ cmpptr(to, from);
 216   __ lea(end_from, Address(from, count, sf, 0));
 217   if (NOLp == nullptr) {
 218     RuntimeAddress no_overlap(no_overlap_target);
 219     __ jump_cc(Assembler::belowEqual, no_overlap);
 220     __ cmpptr(to, end_from);
 221     __ jump_cc(Assembler::aboveEqual, no_overlap);
 222   } else {
 223     __ jcc(Assembler::belowEqual, (*NOLp));
 224     __ cmpptr(to, end_from);
 225     __ jcc(Assembler::aboveEqual, (*NOLp));
 226   }
 227 }
 228 
 229 
 230 // Copy big chunks forward
 231 //
 232 // Inputs:
//   end_from     - source array end address
//   end_to       - destination array end address
//   qword_count  - 64-bit element count, negative
//   tmp1         - scratch
//   tmp2         - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes - exit label
 239 //
 240 void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
 241                                        Register qword_count, Register tmp1,
 242                                        Register tmp2, Label& L_copy_bytes,
 243                                        Label& L_copy_8_bytes, DecoratorSet decorators,
 244                                        BasicType type) {
 245   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 246   DEBUG_ONLY(__ stop("enter at entry label, not here"));
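  // qword_count is negative on entry; the copy loop below indexes off
  // end_from/end_to and walks qword_count up toward zero, moving 4 or 8
  // qwords per iteration depending on UseUnalignedLoadStores.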
 247   Label L_loop;
 248   __ align(OptoLoopAlignment);
 249   if (UseUnalignedLoadStores) {
 250     Label L_end;
 251     __ BIND(L_loop);
 252     if (UseAVX >= 2) {
 253       bs->copy_load_at(_masm, decorators, type, 32,
 254                        xmm0, Address(end_from, qword_count, Address::times_8, -56),
 255                        tmp1, xmm1);
 256       bs->copy_store_at(_masm, decorators, type, 32,
 257                         Address(end_to, qword_count, Address::times_8, -56), xmm0,
 258                         tmp1, tmp2, xmm1);
 259 
 260       bs->copy_load_at(_masm, decorators, type, 32,
 261                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 262                        tmp1, xmm1);
 263       bs->copy_store_at(_masm, decorators, type, 32,
 264                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 265                         tmp1, tmp2, xmm1);
 266     } else {
 267       bs->copy_load_at(_masm, decorators, type, 16,
 268                        xmm0, Address(end_from, qword_count, Address::times_8, -56),
 269                        tmp1, xmm1);
 270       bs->copy_store_at(_masm, decorators, type, 16,
 271                         Address(end_to, qword_count, Address::times_8, -56), xmm0,
 272                         tmp1, tmp2, xmm1);
 273       bs->copy_load_at(_masm, decorators, type, 16,
 274                        xmm0, Address(end_from, qword_count, Address::times_8, -40),
 275                        tmp1, xmm1);
 276       bs->copy_store_at(_masm, decorators, type, 16,
 277                         Address(end_to, qword_count, Address::times_8, -40), xmm0,
 278                         tmp1, tmp2, xmm1);
 279       bs->copy_load_at(_masm, decorators, type, 16,
 280                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 281                        tmp1, xmm1);
 282       bs->copy_store_at(_masm, decorators, type, 16,
 283                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 284                         tmp1, tmp2, xmm1);
 285       bs->copy_load_at(_masm, decorators, type, 16,
 286                        xmm0, Address(end_from, qword_count, Address::times_8, -8),
 287                        tmp1, xmm1);
 288       bs->copy_store_at(_masm, decorators, type, 16,
 289                         Address(end_to, qword_count, Address::times_8, -8), xmm0,
 290                         tmp1, tmp2, xmm1);
 291     }
 292 
 293     __ BIND(L_copy_bytes);
 294     __ addptr(qword_count, 8);
 295     __ jcc(Assembler::lessEqual, L_loop);
 296     __ subptr(qword_count, 4);  // sub(8) and add(4)
 297     __ jcc(Assembler::greater, L_end);
 298     // Copy trailing 32 bytes
 299     if (UseAVX >= 2) {
 300       bs->copy_load_at(_masm, decorators, type, 32,
 301                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 302                        tmp1, xmm1);
 303       bs->copy_store_at(_masm, decorators, type, 32,
 304                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 305                         tmp1, tmp2, xmm1);
 306     } else {
 307       bs->copy_load_at(_masm, decorators, type, 16,
 308                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 309                        tmp1, xmm1);
 310       bs->copy_store_at(_masm, decorators, type, 16,
 311                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 312                         tmp1, tmp2, xmm1);
 313       bs->copy_load_at(_masm, decorators, type, 16,
 314                        xmm0, Address(end_from, qword_count, Address::times_8, -8),
 315                        tmp1, xmm1);
 316       bs->copy_store_at(_masm, decorators, type, 16,
 317                         Address(end_to, qword_count, Address::times_8, -8), xmm0,
 318                         tmp1, tmp2, xmm1);
 319     }
 320     __ addptr(qword_count, 4);
 321     __ BIND(L_end);
 322   } else {
    // Copy 32 bytes per iteration
 324     __ BIND(L_loop);
 325     bs->copy_load_at(_masm, decorators, type, 8,
 326                      tmp1, Address(end_from, qword_count, Address::times_8, -24),
 327                      tmp2);
 328     bs->copy_store_at(_masm, decorators, type, 8,
 329                       Address(end_to, qword_count, Address::times_8, -24), tmp1,
 330                       tmp2);
 331     bs->copy_load_at(_masm, decorators, type, 8,
 332                      tmp1, Address(end_from, qword_count, Address::times_8, -16),
 333                      tmp2);
 334     bs->copy_store_at(_masm, decorators, type, 8,
 335                       Address(end_to, qword_count, Address::times_8, -16), tmp1,
 336                       tmp2);
 337     bs->copy_load_at(_masm, decorators, type, 8,
 338                      tmp1, Address(end_from, qword_count, Address::times_8, -8),
 339                      tmp2);
 340     bs->copy_store_at(_masm, decorators, type, 8,
 341                       Address(end_to, qword_count, Address::times_8, -8), tmp1,
 342                       tmp2);
 343     bs->copy_load_at(_masm, decorators, type, 8,
 344                      tmp1, Address(end_from, qword_count, Address::times_8, 0),
 345                      tmp2);
 346     bs->copy_store_at(_masm, decorators, type, 8,
 347                       Address(end_to, qword_count, Address::times_8, 0), tmp1,
 348                       tmp2);
 349 
 350     __ BIND(L_copy_bytes);
 351     __ addptr(qword_count, 4);
 352     __ jcc(Assembler::lessEqual, L_loop);
 353   }
 354   __ subptr(qword_count, 4);
 355   __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
 356 }
 357 
 358 
 359 // Copy big chunks backward
 360 //
 361 // Inputs:
//   from         - source array address
//   dest         - destination array address
//   qword_count  - 64-bit element count
//   tmp1         - scratch
//   tmp2         - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes - exit label
 368 //
 369 void StubGenerator::copy_bytes_backward(Register from, Register dest,
 370                                         Register qword_count, Register tmp1,
 371                                         Register tmp2, Label& L_copy_bytes,
 372                                         Label& L_copy_8_bytes, DecoratorSet decorators,
 373                                         BasicType type) {
 374   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 375   DEBUG_ONLY(__ stop("enter at entry label, not here"));
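  // qword_count is positive on entry; the copy loop below indexes off
  // from/dest and walks qword_count down toward zero, moving 4 or 8 qwords
  // per iteration depending on UseUnalignedLoadStores, working from the high
  // end of the arrays downward.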
 376   Label L_loop;
 377   __ align(OptoLoopAlignment);
 378   if (UseUnalignedLoadStores) {
 379     Label L_end;
 380     __ BIND(L_loop);
 381     if (UseAVX >= 2) {
 382       bs->copy_load_at(_masm, decorators, type, 32,
 383                        xmm0, Address(from, qword_count, Address::times_8, 32),
 384                        tmp1, xmm1);
 385       bs->copy_store_at(_masm, decorators, type, 32,
 386                         Address(dest, qword_count, Address::times_8, 32), xmm0,
 387                         tmp1, tmp2, xmm1);
 388       bs->copy_load_at(_masm, decorators, type, 32,
 389                        xmm0, Address(from, qword_count, Address::times_8, 0),
 390                        tmp1, xmm1);
 391       bs->copy_store_at(_masm, decorators, type, 32,
 392                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 393                         tmp1, tmp2, xmm1);
 394     } else {
 395       bs->copy_load_at(_masm, decorators, type, 16,
 396                        xmm0, Address(from, qword_count, Address::times_8, 48),
 397                        tmp1, xmm1);
 398       bs->copy_store_at(_masm, decorators, type, 16,
 399                         Address(dest, qword_count, Address::times_8, 48), xmm0,
 400                         tmp1, tmp2, xmm1);
 401       bs->copy_load_at(_masm, decorators, type, 16,
 402                        xmm0, Address(from, qword_count, Address::times_8, 32),
 403                        tmp1, xmm1);
 404       bs->copy_store_at(_masm, decorators, type, 16,
 405                         Address(dest, qword_count, Address::times_8, 32), xmm0,
 406                         tmp1, tmp2, xmm1);
 407       bs->copy_load_at(_masm, decorators, type, 16,
 408                        xmm0, Address(from, qword_count, Address::times_8, 16),
 409                        tmp1, xmm1);
 410       bs->copy_store_at(_masm, decorators, type, 16,
 411                         Address(dest, qword_count, Address::times_8, 16), xmm0,
 412                         tmp1, tmp2, xmm1);
 413       bs->copy_load_at(_masm, decorators, type, 16,
 414                        xmm0, Address(from, qword_count, Address::times_8, 0),
 415                        tmp1, xmm1);
 416       bs->copy_store_at(_masm, decorators, type, 16,
 417                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 418                         tmp1, tmp2, xmm1);
 419     }
 420 
 421     __ BIND(L_copy_bytes);
 422     __ subptr(qword_count, 8);
 423     __ jcc(Assembler::greaterEqual, L_loop);
 424 
 425     __ addptr(qword_count, 4);  // add(8) and sub(4)
 426     __ jcc(Assembler::less, L_end);
 427     // Copy trailing 32 bytes
 428     if (UseAVX >= 2) {
 429       bs->copy_load_at(_masm, decorators, type, 32,
 430                        xmm0, Address(from, qword_count, Address::times_8, 0),
 431                        tmp1, xmm1);
 432       bs->copy_store_at(_masm, decorators, type, 32,
 433                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 434                         tmp1, tmp2, xmm1);
 435     } else {
 436       bs->copy_load_at(_masm, decorators, type, 16,
 437                        xmm0, Address(from, qword_count, Address::times_8, 16),
 438                        tmp1, xmm1);
 439       bs->copy_store_at(_masm, decorators, type, 16,
 440                         Address(dest, qword_count, Address::times_8, 16), xmm0,
 441                         tmp1, tmp2, xmm1);
 442       bs->copy_load_at(_masm, decorators, type, 16,
 443                        xmm0, Address(from, qword_count, Address::times_8, 0),
 444                        tmp1, xmm1);
 445       bs->copy_store_at(_masm, decorators, type, 16,
 446                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 447                         tmp1, tmp2, xmm1);
 448     }
 449     __ subptr(qword_count, 4);
 450     __ BIND(L_end);
 451   } else {
    // Copy 32 bytes per iteration
 453     __ BIND(L_loop);
 454     bs->copy_load_at(_masm, decorators, type, 8,
 455                      tmp1, Address(from, qword_count, Address::times_8, 24),
 456                      tmp2);
 457     bs->copy_store_at(_masm, decorators, type, 8,
 458                       Address(dest, qword_count, Address::times_8, 24), tmp1,
 459                       tmp2);
 460     bs->copy_load_at(_masm, decorators, type, 8,
 461                      tmp1, Address(from, qword_count, Address::times_8, 16),
 462                      tmp2);
 463     bs->copy_store_at(_masm, decorators, type, 8,
 464                       Address(dest, qword_count, Address::times_8, 16), tmp1,
 465                       tmp2);
 466     bs->copy_load_at(_masm, decorators, type, 8,
 467                      tmp1, Address(from, qword_count, Address::times_8, 8),
 468                      tmp2);
 469     bs->copy_store_at(_masm, decorators, type, 8,
 470                       Address(dest, qword_count, Address::times_8, 8), tmp1,
 471                       tmp2);
 472     bs->copy_load_at(_masm, decorators, type, 8,
 473                      tmp1, Address(from, qword_count, Address::times_8, 0),
 474                      tmp2);
 475     bs->copy_store_at(_masm, decorators, type, 8,
 476                       Address(dest, qword_count, Address::times_8, 0), tmp1,
 477                       tmp2);
 478 
 479     __ BIND(L_copy_bytes);
 480     __ subptr(qword_count, 4);
 481     __ jcc(Assembler::greaterEqual, L_loop);
 482   }
 483   __ addptr(qword_count, 4);
 484   __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
 485 }
 486 
 487 #if COMPILER2_OR_JVMCI
 488 
// Note: The following rules apply to the AVX3-optimized arraycopy stubs:
// - If the target supports the AVX3 features (BW+VL+F), the implementation uses 32-byte
//   vectors (YMMs) for both the special cases (various small block sizes) and the aligned
//   copy loop. This is the default configuration.
// - If the copy length is above AVX3Threshold, the implementation uses 64-byte vectors (ZMMs)
//   for the main copy loop (and the subsequent tail), since the bulk of the cycles will be
//   consumed in it.
// - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS is seen to give
//   better performance for disjoint copies. For conjoint/backward copies, vector-based
//   copy performs better.
// - If the user sets AVX3Threshold=0, the special cases for small block sizes also operate
//   over 64-byte vector registers (ZMMs).
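//
// Roughly, the dispatch implemented by generate_disjoint_copy_avx3_masked below
// is (a sketch of the control flow, not the literal generated code):
//   - byte length <= 192:          small special cases using [masked] vector copies
//   - MaxVectorSize == 64 and
//     byte length >= ~2.5 MB:      large-copy loop using non-temporal stores
//   - AVX3Threshold != 0 and
//     byte length >= 4096:         64-byte (ZMM) pre/main/post loop, or REP MOVS
//                                  when MaxVectorSize < 64
//   - otherwise:                   pre/main/post loop using 32-byte (YMM) accesses
//                                  (64-byte accesses when AVX3Threshold == 0)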
 500 
 501 // Inputs:
 502 //   c_rarg0   - source array address
 503 //   c_rarg1   - destination array address
 504 //   c_rarg2   - element count, treated as ssize_t, can be zero
 505 //
 506 //
// Side Effects:
//   *entry (when non-null) is set to the no-overlap entry point used by
//   generate_conjoint_[byte/int/short/long]_copy().
 510 //
 511 address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const char *name,
 512                                                           int shift, bool aligned, bool is_oop,
 513                                                           bool dest_uninitialized) {
 514   __ align(CodeEntryAlignment);
 515   StubCodeMark mark(this, "StubRoutines", name);
 516   address start = __ pc();
 517 
 518   int avx3threshold = VM_Version::avx3_threshold();
 519   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
 520   const int large_threshold = 2621440; // 2.5 MB
 521   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 522   Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 523   Label L_copy_large, L_finish;
 524   const Register from        = rdi;  // source array address
 525   const Register to          = rsi;  // destination array address
 526   const Register count       = rdx;  // elements count
 527   const Register temp1       = r8;
 528   const Register temp2       = r11;
 529   const Register temp3       = rax;
 530   const Register temp4       = rcx;
 531   // End pointers are inclusive, and if count is not zero they point
 532   // to the last unit copied:  end_to[0] := end_from[0]
 533 
 534   __ enter(); // required for proper stackwalking of RuntimeStub frame
 535   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 536 
 537   if (entry != nullptr) {
 538     *entry = __ pc();
 539      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 540     BLOCK_COMMENT("Entry:");
 541   }
 542 
 543   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 544   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 545 
 546   setup_argument_regs(type);
 547 
 548   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 549   if (dest_uninitialized) {
 550     decorators |= IS_DEST_UNINITIALIZED;
 551   }
 552   if (aligned) {
 553     decorators |= ARRAYCOPY_ALIGNED;
 554   }
 555   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 556   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 557 
 558   {
 559     // Type(shift)           byte(0), short(1), int(2),   long(3)
 560     int loop_size[]        = { 192,     96,       48,      24};
 561     int threshold[]        = { 4096,    2048,     1024,    512};
 562 
 563     // UnsafeMemoryAccess page error: continue after unsafe access
 564     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
 565     // 'from', 'to' and 'count' are now valid
 566 
    // temp1 holds the remaining count and temp4 holds the running count used to
    // compute the next address offset from the start of the to/from arrays (temp4 * scale).
 569     __ mov64(temp4, 0);
 570     __ movq(temp1, count);
 571 
 572     // Zero length check.
 573     __ BIND(L_tail);
 574     __ cmpq(temp1, 0);
 575     __ jcc(Assembler::lessEqual, L_exit);
 576 
 577     // Special cases using 32 byte [masked] vector copy operations.
 578     arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 579                                  temp4, temp3, use64byteVector, L_entry, L_exit);
 580 
 581     // PRE-MAIN-POST loop for aligned copy.
 582     __ BIND(L_entry);
 583 
 584     if (MaxVectorSize == 64) {
 585       __ movq(temp2, temp1);
 586       __ shlq(temp2, shift);
 587       __ cmpq(temp2, large_threshold);
 588       __ jcc(Assembler::greaterEqual, L_copy_large);
 589     }
 590     if (avx3threshold != 0) {
 591       __ cmpq(count, threshold[shift]);
 592       if (MaxVectorSize == 64) {
 593         // Copy using 64 byte vectors.
 594         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 595       } else {
 596         assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offers a faster copy path.
 598         __ jcc(Assembler::greaterEqual, L_repmovs);
 599       }
 600     }
 601 
 602     if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
 603       // Partial copy to make dst address 32 byte aligned.
 604       __ movq(temp2, to);
 605       __ andq(temp2, 31);
 606       __ jcc(Assembler::equal, L_main_pre_loop);
 607 
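      // Compute into temp2 the number of elements needed to reach the next
      // 32-byte boundary of the destination: (32 - (to & 31)) >> shift.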
 608       __ negptr(temp2);
 609       __ addq(temp2, 32);
 610       if (shift) {
 611         __ shrq(temp2, shift);
 612       }
 613       __ movq(temp3, temp2);
 614       copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
 615       __ movq(temp4, temp2);
 616       __ movq(temp1, count);
 617       __ subq(temp1, temp2);
 618 
 619       __ cmpq(temp1, loop_size[shift]);
 620       __ jcc(Assembler::less, L_tail);
 621 
 622       __ BIND(L_main_pre_loop);
 623       __ subq(temp1, loop_size[shift]);
 624 
 625       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 626       __ align32();
 627       __ BIND(L_main_loop);
 628          copy64_avx(to, from, temp4, xmm1, false, shift, 0);
 629          copy64_avx(to, from, temp4, xmm1, false, shift, 64);
 630          copy64_avx(to, from, temp4, xmm1, false, shift, 128);
 631          __ addptr(temp4, loop_size[shift]);
 632          __ subq(temp1, loop_size[shift]);
 633          __ jcc(Assembler::greater, L_main_loop);
 634 
 635       __ addq(temp1, loop_size[shift]);
 636 
 637       // Tail loop.
 638       __ jmp(L_tail);
 639 
 640       __ BIND(L_repmovs);
 641         __ movq(temp2, temp1);
        // Swap to(RSI) and from(RDI) addresses to comply with REP MOVS semantics.
 643         __ movq(temp3, to);
 644         __ movq(to,  from);
 645         __ movq(from, temp3);
 646         // Save to/from for restoration post rep_mov.
 647         __ movq(temp1, to);
 648         __ movq(temp3, from);
        if (shift < 3) {
 650           __ shrq(temp2, 3-shift);     // quad word count
 651         }
        __ movq(temp4, temp2);         // move quad word count into temp4(RCX).
 653         __ rep_mov();
 654         __ shlq(temp2, 3);             // convert quad words into byte count.
        if (shift) {
 656           __ shrq(temp2, shift);       // type specific count.
 657         }
 658         // Restore original addresses in to/from.
 659         __ movq(to, temp3);
 660         __ movq(from, temp1);
 661         __ movq(temp4, temp2);
 662         __ movq(temp1, count);
        __ subq(temp1, temp2);         // trailing part (less than a quad word in size).
 664         __ jmp(L_tail);
 665     }
 666 
 667     if (MaxVectorSize > 32) {
 668       __ BIND(L_pre_main_post_64);
 669       // Partial copy to make dst address 64 byte aligned.
 670       __ movq(temp2, to);
 671       __ andq(temp2, 63);
 672       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 673 
 674       __ negptr(temp2);
 675       __ addq(temp2, 64);
 676       if (shift) {
 677         __ shrq(temp2, shift);
 678       }
 679       __ movq(temp3, temp2);
 680       copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
 681       __ movq(temp4, temp2);
 682       __ movq(temp1, count);
 683       __ subq(temp1, temp2);
 684 
 685       __ cmpq(temp1, loop_size[shift]);
 686       __ jcc(Assembler::less, L_tail64);
 687 
 688       __ BIND(L_main_pre_loop_64bytes);
 689       __ subq(temp1, loop_size[shift]);
 690 
 691       // Main loop with aligned copy block size of 192 bytes at
 692       // 64 byte copy granularity.
 693       __ align32();
 694       __ BIND(L_main_loop_64bytes);
 695          copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
 696          copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
 697          copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
 698          __ addptr(temp4, loop_size[shift]);
 699          __ subq(temp1, loop_size[shift]);
 700          __ jcc(Assembler::greater, L_main_loop_64bytes);
 701 
 702       __ addq(temp1, loop_size[shift]);
 703       // Zero length check.
 704       __ jcc(Assembler::lessEqual, L_exit);
 705 
 706       __ BIND(L_tail64);
 707 
 708       // Tail handling using 64 byte [masked] vector copy operations.
 709       use64byteVector = true;
 710       arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 711                                    temp4, temp3, use64byteVector, L_entry, L_exit);
 712     }
 713     __ BIND(L_exit);
 714   }
 715 
 716   __ BIND(L_finish);
 717   address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 contains specific values
  // used during the arraycopy epilogue, so re-initialize r11 here.
 720   if (is_oop) {
 721     __ movq(r11, shift == 3 ? count : to);
 722   }
 723   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
 724   restore_argument_regs(type);
 725   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
 726   __ xorptr(rax, rax); // return 0
 727   __ vzeroupper();
 728   __ leave(); // required for proper stackwalking of RuntimeStub frame
 729   __ ret(0);
 730 
 731   if (MaxVectorSize == 64) {
 732     __ BIND(L_copy_large);
 733       UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
 734       arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
 735     __ jmp(L_finish);
 736   }
 737   return start;
 738 }
 739 
 740 void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
 741                                          Register temp3, Register temp4, Register count,
 742                                          XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
 743                                          XMMRegister xmm4, int shift) {
 744 
 745   // Type(shift)           byte(0), short(1), int(2),   long(3)
 746   int loop_size[]        = { 256,     128,       64,      32};
 747   int threshold[]        = { 4096,    2048,     1024,    512};
 748 
 749   Label L_main_loop_large;
 750   Label L_tail_large;
 751   Label L_exit_large;
 752   Label L_entry_large;
 753   Label L_main_pre_loop_large;
 754   Label L_pre_main_post_large;
 755 
 756   assert(MaxVectorSize == 64, "vector length != 64");
 757   __ BIND(L_entry_large);
 758 
 759   __ BIND(L_pre_main_post_large);
 760   // Partial copy to make dst address 64 byte aligned.
 761   __ movq(temp2, to);
 762   __ andq(temp2, 63);
 763   __ jcc(Assembler::equal, L_main_pre_loop_large);
 764 
 765   __ negptr(temp2);
 766   __ addq(temp2, 64);
 767   if (shift) {
 768     __ shrq(temp2, shift);
 769   }
 770   __ movq(temp3, temp2);
 771   copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
 772   __ movq(temp4, temp2);
 773   __ movq(temp1, count);
 774   __ subq(temp1, temp2);
 775 
 776   __ cmpq(temp1, loop_size[shift]);
 777   __ jcc(Assembler::less, L_tail_large);
 778 
 779   __ BIND(L_main_pre_loop_large);
 780   __ subq(temp1, loop_size[shift]);
 781 
 782   // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
 783   __ align32();
 784   __ BIND(L_main_loop_large);
 785   copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
 786   __ addptr(temp4, loop_size[shift]);
 787   __ subq(temp1, loop_size[shift]);
 788   __ jcc(Assembler::greater, L_main_loop_large);
 789   // fence needed because copy256_avx3 uses non-temporal stores
 790   __ sfence();
 791 
 792   __ addq(temp1, loop_size[shift]);
 793   // Zero length check.
 794   __ jcc(Assembler::lessEqual, L_exit_large);
 795   __ BIND(L_tail_large);
 796   // Tail handling using 64 byte [masked] vector copy operations.
 797   __ cmpq(temp1, 0);
 798   __ jcc(Assembler::lessEqual, L_exit_large);
  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, L_exit_large);
 801   __ BIND(L_exit_large);
 802 }
 803 
 804 // Inputs:
 805 //   c_rarg0   - source array address
 806 //   c_rarg1   - destination array address
 807 //   c_rarg2   - element count, treated as ssize_t, can be zero
 808 //
 809 //
 810 address StubGenerator::generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
 811                                                           address nooverlap_target, bool aligned,
 812                                                           bool is_oop, bool dest_uninitialized) {
 813   __ align(CodeEntryAlignment);
 814   StubCodeMark mark(this, "StubRoutines", name);
 815   address start = __ pc();
 816 
 817   int avx3threshold = VM_Version::avx3_threshold();
 818   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
 819 
 820   Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 821   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 822   const Register from        = rdi;  // source array address
 823   const Register to          = rsi;  // destination array address
 824   const Register count       = rdx;  // elements count
 825   const Register temp1       = r8;
 826   const Register temp2       = rcx;
 827   const Register temp3       = r11;
 828   const Register temp4       = rax;
 829   // End pointers are inclusive, and if count is not zero they point
 830   // to the last unit copied:  end_to[0] := end_from[0]
 831 
 832   __ enter(); // required for proper stackwalking of RuntimeStub frame
 833   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 834 
 835   if (entry != nullptr) {
 836     *entry = __ pc();
 837      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 838     BLOCK_COMMENT("Entry:");
 839   }
 840 
 841   array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
 842 
 843   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 844   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 845 
 846   setup_argument_regs(type);
 847 
 848   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 849   if (dest_uninitialized) {
 850     decorators |= IS_DEST_UNINITIALIZED;
 851   }
 852   if (aligned) {
 853     decorators |= ARRAYCOPY_ALIGNED;
 854   }
 855   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 856   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 857   {
 858     // Type(shift)       byte(0), short(1), int(2),   long(3)
 859     int loop_size[]   = { 192,     96,       48,      24};
 860     int threshold[]   = { 4096,    2048,     1024,    512};
 861 
 862     // UnsafeMemoryAccess page error: continue after unsafe access
 863     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
 864     // 'from', 'to' and 'count' are now valid
 865 
 866     // temp1 holds remaining count.
 867     __ movq(temp1, count);
 868 
 869     // Zero length check.
 870     __ BIND(L_tail);
 871     __ cmpq(temp1, 0);
 872     __ jcc(Assembler::lessEqual, L_exit);
 873 
 874     __ mov64(temp2, 0);
 875     __ movq(temp3, temp1);
 876     // Special cases using 32 byte [masked] vector copy operations.
 877     arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
 878                                           temp4, use64byteVector, L_entry, L_exit);
 879 
 880     // PRE-MAIN-POST loop for aligned copy.
 881     __ BIND(L_entry);
 882 
 883     if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
 884       __ cmpq(temp1, threshold[shift]);
 885       __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 886     }
 887 
 888     if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
 889       // Partial copy to make dst address 32 byte aligned.
 890       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
 891       __ andq(temp2, 31);
 892       __ jcc(Assembler::equal, L_main_pre_loop);
 893 
 894       if (shift) {
 895         __ shrq(temp2, shift);
 896       }
 897       __ subq(temp1, temp2);
 898       copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
 899 
 900       __ cmpq(temp1, loop_size[shift]);
 901       __ jcc(Assembler::less, L_tail);
 902 
 903       __ BIND(L_main_pre_loop);
 904 
 905       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 906       __ align32();
 907       __ BIND(L_main_loop);
 908          copy64_avx(to, from, temp1, xmm1, true, shift, -64);
 909          copy64_avx(to, from, temp1, xmm1, true, shift, -128);
 910          copy64_avx(to, from, temp1, xmm1, true, shift, -192);
 911          __ subptr(temp1, loop_size[shift]);
 912          __ cmpq(temp1, loop_size[shift]);
 913          __ jcc(Assembler::greater, L_main_loop);
 914 
 915       // Tail loop.
 916       __ jmp(L_tail);
 917     }
 918 
 919     if (MaxVectorSize > 32) {
 920       __ BIND(L_pre_main_post_64);
 921       // Partial copy to make dst address 64 byte aligned.
 922       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
 923       __ andq(temp2, 63);
 924       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 925 
 926       if (shift) {
 927         __ shrq(temp2, shift);
 928       }
 929       __ subq(temp1, temp2);
 930       copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
 931 
 932       __ cmpq(temp1, loop_size[shift]);
 933       __ jcc(Assembler::less, L_tail64);
 934 
 935       __ BIND(L_main_pre_loop_64bytes);
 936 
 937       // Main loop with aligned copy block size of 192 bytes at
 938       // 64 byte copy granularity.
 939       __ align32();
 940       __ BIND(L_main_loop_64bytes);
 941          copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
 942          copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
 943          copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
 944          __ subq(temp1, loop_size[shift]);
 945          __ cmpq(temp1, loop_size[shift]);
 946          __ jcc(Assembler::greater, L_main_loop_64bytes);
 947 
 948       // Zero length check.
 949       __ cmpq(temp1, 0);
 950       __ jcc(Assembler::lessEqual, L_exit);
 951 
 952       __ BIND(L_tail64);
 953 
 954       // Tail handling using 64 byte [masked] vector copy operations.
 955       use64byteVector = true;
 956       __ mov64(temp2, 0);
 957       __ movq(temp3, temp1);
 958       arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
 959                                             temp4, use64byteVector, L_entry, L_exit);
 960     }
 961     __ BIND(L_exit);
 962   }
 963   address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 contains specific values
  // used during the arraycopy epilogue, so re-initialize r11 here.
  if (is_oop) {
 967     __ movq(r11, count);
 968   }
 969   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
 970   restore_argument_regs(type);
 971   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
 972   __ xorptr(rax, rax); // return 0
 973   __ vzeroupper();
 974   __ leave(); // required for proper stackwalking of RuntimeStub frame
 975   __ ret(0);
 976 
 977   return start;
 978 }
 979 
 980 void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
 981                                                  Register to, Register count, int shift,
 982                                                  Register index, Register temp,
 983                                                  bool use64byteVector, Label& L_entry, Label& L_exit) {
 984   Label L_entry_64, L_entry_96, L_entry_128;
 985   Label L_entry_160, L_entry_192;
 986 
 987   int size_mat[][6] = {
 988   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
 989   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
 990   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
 991   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
 992   };
 993 
  // Case A) Special case for length less than or equal to 32 bytes.
 995   __ cmpq(count, size_mat[shift][0]);
 996   __ jccb(Assembler::greater, L_entry_64);
 997   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
 998   __ jmp(L_exit);
 999 
  // Case B) Special case for length less than or equal to 64 bytes.
1001   __ BIND(L_entry_64);
1002   __ cmpq(count, size_mat[shift][1]);
1003   __ jccb(Assembler::greater, L_entry_96);
1004   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
1005   __ jmp(L_exit);
1006 
  // Case C) Special case for length less than or equal to 96 bytes.
1008   __ BIND(L_entry_96);
1009   __ cmpq(count, size_mat[shift][2]);
1010   __ jccb(Assembler::greater, L_entry_128);
1011   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1012   __ subq(count, 64 >> shift);
1013   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
1014   __ jmp(L_exit);
1015 
  // Case D) Special case for length less than or equal to 128 bytes.
1017   __ BIND(L_entry_128);
1018   __ cmpq(count, size_mat[shift][3]);
1019   __ jccb(Assembler::greater, L_entry_160);
1020   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1021   copy32_avx(to, from, index, xmm, shift, 64);
1022   __ subq(count, 96 >> shift);
1023   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
1024   __ jmp(L_exit);
1025 
  // Case E) Special case for length less than or equal to 160 bytes.
1027   __ BIND(L_entry_160);
1028   __ cmpq(count, size_mat[shift][4]);
1029   __ jccb(Assembler::greater, L_entry_192);
1030   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1031   copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1032   __ subq(count, 128 >> shift);
1033   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
1034   __ jmp(L_exit);
1035 
  // Case F) Special case for length less than or equal to 192 bytes.
1037   __ BIND(L_entry_192);
1038   __ cmpq(count, size_mat[shift][5]);
1039   __ jcc(Assembler::greater, L_entry);
1040   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1041   copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1042   copy32_avx(to, from, index, xmm, shift, 128);
1043   __ subq(count, 160 >> shift);
1044   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
1045   __ jmp(L_exit);
1046 }
1047 
1048 void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
1049                                                      Register to, Register count, int shift, Register index,
1050                                                      Register temp, Label& L_exit) {
1051   Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
1052 
1053   int size_mat[][4] = {
1054   /* T_BYTE */ {64, 128, 192, 256},
1055   /* T_SHORT*/ {32, 64 , 96 , 128},
1056   /* T_INT  */ {16, 32 , 48 ,  64},
1057   /* T_LONG */ { 8, 16 , 24 ,  32}
1058   };
1059 
1060   assert(MaxVectorSize == 64, "vector length != 64");
1061   // Case A) Special case for length less than or equal to 64 bytes.
1062   __ BIND(L_entry_64);
1063   __ cmpq(count, size_mat[shift][0]);
1064   __ jccb(Assembler::greater, L_entry_128);
1065   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
1066   __ jmp(L_exit);
1067 
1068   // Case B) Special case for length less than or equal to 128 bytes.
1069   __ BIND(L_entry_128);
1070   __ cmpq(count, size_mat[shift][1]);
1071   __ jccb(Assembler::greater, L_entry_192);
1072   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1073   __ subq(count, 64 >> shift);
1074   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
1075   __ jmp(L_exit);
1076 
1077   // Case C) Special case for length less than or equal to 192 bytes.
1078   __ BIND(L_entry_192);
1079   __ cmpq(count, size_mat[shift][2]);
1080   __ jcc(Assembler::greater, L_entry_256);
1081   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1082   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1083   __ subq(count, 128 >> shift);
1084   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
1085   __ jmp(L_exit);
1086 
1087   // Case D) Special case for length less than or equal to 256 bytes.
1088   __ BIND(L_entry_256);
1089   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1090   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1091   copy64_avx(to, from, index, xmm, false, shift, 128, true);
1092   __ subq(count, 192 >> shift);
1093   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
1094   __ jmp(L_exit);
1095 }
1096 
1097 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
1098                                                            Register to, Register start_index, Register end_index,
1099                                                            Register count, int shift, Register temp,
1100                                                            bool use64byteVector, Label& L_entry, Label& L_exit) {
1101   Label L_entry_64, L_entry_96, L_entry_128;
1102   Label L_entry_160, L_entry_192;
1103   bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
1104 
1105   int size_mat[][6] = {
1106   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
1107   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
1108   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
1109   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
1110   };
1111 
  // Case A) Special case for length less than or equal to 32 bytes.
1113   __ cmpq(count, size_mat[shift][0]);
1114   __ jccb(Assembler::greater, L_entry_64);
1115   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1116   __ jmp(L_exit);
1117 
  // Case B) Special case for length less than or equal to 64 bytes.
1119   __ BIND(L_entry_64);
1120   __ cmpq(count, size_mat[shift][1]);
1121   __ jccb(Assembler::greater, L_entry_96);
1122   if (avx3) {
1123      copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
1124   } else {
1125      copy32_avx(to, from, end_index, xmm, shift, -32);
1126      __ subq(count, 32 >> shift);
1127      copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1128   }
1129   __ jmp(L_exit);
1130 
  // Case C) Special case for length less than or equal to 96 bytes.
1132   __ BIND(L_entry_96);
1133   __ cmpq(count, size_mat[shift][2]);
1134   __ jccb(Assembler::greater, L_entry_128);
1135   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1136   __ subq(count, 64 >> shift);
1137   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1138   __ jmp(L_exit);
1139 
  // Case D) Special case for length less than or equal to 128 bytes.
1141   __ BIND(L_entry_128);
1142   __ cmpq(count, size_mat[shift][3]);
1143   __ jccb(Assembler::greater, L_entry_160);
1144   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1145   copy32_avx(to, from, end_index, xmm, shift, -96);
1146   __ subq(count, 96 >> shift);
1147   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1148   __ jmp(L_exit);
1149 
  // Case E) Special case for length less than or equal to 160 bytes.
1151   __ BIND(L_entry_160);
1152   __ cmpq(count, size_mat[shift][4]);
1153   __ jccb(Assembler::greater, L_entry_192);
1154   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1155   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1156   __ subq(count, 128 >> shift);
1157   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1158   __ jmp(L_exit);
1159 
  // Case F) Special case for length less than or equal to 192 bytes.
1161   __ BIND(L_entry_192);
1162   __ cmpq(count, size_mat[shift][5]);
1163   __ jcc(Assembler::greater, L_entry);
1164   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1165   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1166   copy32_avx(to, from, end_index, xmm, shift, -160);
1167   __ subq(count, 160 >> shift);
1168   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1169   __ jmp(L_exit);
1170 }
1171 
1172 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1173                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1174                                 int shift, int offset) {
1175   if (MaxVectorSize == 64) {
1176     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
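    // Prefetch the source roughly 512 and 1024 bytes ahead of the 256-byte
    // block being copied. The stores below are non-temporal (streaming), so
    // the caller must issue an sfence once the copy loop is done (see
    // arraycopy_avx3_large).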
1177     __ prefetcht0(Address(src, index, scale, offset + 0x200));
1178     __ prefetcht0(Address(src, index, scale, offset + 0x240));
1179     __ prefetcht0(Address(src, index, scale, offset + 0x280));
1180     __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1181 
1182     __ prefetcht0(Address(src, index, scale, offset + 0x400));
1183     __ prefetcht0(Address(src, index, scale, offset + 0x440));
1184     __ prefetcht0(Address(src, index, scale, offset + 0x480));
1185     __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1186 
1187     __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1188     __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1189     __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1190     __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1191 
1192     __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1193     __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1194     __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1195     __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1196   }
1197 }
1198 
1199 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1200                                        KRegister mask, Register length, Register index,
1201                                        Register temp, int shift, int offset,
1202                                        bool use64byteVector) {
1203   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1204   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1205   if (!use64byteVector) {
1206     copy32_avx(dst, src, index, xmm, shift, offset);
1207     __ subptr(length, 32 >> shift);
1208     copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1209   } else {
1210     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1211     assert(MaxVectorSize == 64, "vector length != 64");
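    // Build a mask with the low 'length' bits set (the same BZHI trick as in
    // copy32_masked_avx() below) and use it for a single masked 512-bit
    // load/store pair.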
1212     __ mov64(temp, -1L);
1213     __ bzhiq(temp, temp, length);
1214     __ kmovql(mask, temp);
1215     __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1216     __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1217   }
1218 }
1219 
1220 
1221 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1222                                        KRegister mask, Register length, Register index,
1223                                        Register temp, int shift, int offset) {
1224   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1225   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1226   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
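  // Mask construction: temp = -1, then BZHI clears every bit at position
  // >= length, leaving exactly the low 'length' bits set.  kmovql moves the
  // result into a mask register so the masked 256-bit load/store below only
  // touches 'length' elements.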
1227   __ mov64(temp, -1L);
1228   __ bzhiq(temp, temp, length);
1229   __ kmovql(mask, temp);
1230   __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1231   __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1232 }
1233 
1234 
1235 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1236                                 int shift, int offset) {
1237   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1238   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1239   __ vmovdqu(xmm, Address(src, index, scale, offset));
1240   __ vmovdqu(Address(dst, index, scale, offset), xmm);
1241 }
1242 
1243 
1244 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1245                                 bool conjoint, int shift, int offset, bool use64byteVector) {
1246   assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1247   if (!use64byteVector) {
1248     if (conjoint) {
1249       copy32_avx(dst, src, index, xmm, shift, offset+32);
1250       copy32_avx(dst, src, index, xmm, shift, offset);
1251     } else {
1252       copy32_avx(dst, src, index, xmm, shift, offset);
1253       copy32_avx(dst, src, index, xmm, shift, offset+32);
1254     }
1255   } else {
1256     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1257     __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1258     __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1259   }
1260 }
1261 
1262 #endif // COMPILER2_OR_JVMCI
1263 
1264 
1265 // Arguments:
1266 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1267 //             ignored
1268 //   name    - stub name string
1269 //
1270 // Inputs:
1271 //   c_rarg0   - source array address
1272 //   c_rarg1   - destination array address
1273 //   c_rarg2   - element count, treated as ssize_t, can be zero
1274 //
1275 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1276 // we let the hardware handle it.  The one to eight bytes within words,
1277 // dwords or qwords that span cache line boundaries will still be loaded
1278 // and stored atomically.
1279 //
1280 // Side Effects:
1281 //   disjoint_byte_copy_entry is set to the no-overlap entry point
1282 //   used by generate_conjoint_byte_copy().
1283 //
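// Roughly equivalent to the following sketch (illustrative only; the code
// below actually walks a negative qword index up toward zero):
//
//   size_t qwords = byte_count >> 3;
//   copy 'qwords' 8-byte chunks from low to high addresses;
//   if (byte_count & 4) copy one trailing dword;
//   if (byte_count & 2) copy one trailing word;
//   if (byte_count & 1) copy one trailing byte;
//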
1284 address StubGenerator::generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1285 #if COMPILER2_OR_JVMCI
1286   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1287      return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1288                                                aligned, false, false);
1289   }
1290 #endif
1291   __ align(CodeEntryAlignment);
1292   StubCodeMark mark(this, "StubRoutines", name);
1293   address start = __ pc();
1294   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1295 
1296   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1297   Label L_copy_byte, L_exit;
1298   const Register from        = rdi;  // source array address
1299   const Register to          = rsi;  // destination array address
1300   const Register count       = rdx;  // elements count
1301   const Register byte_count  = rcx;
1302   const Register qword_count = count;
1303   const Register end_from    = from; // source array end address
1304   const Register end_to      = to;   // destination array end address
1305   // End pointers are inclusive, and if count is not zero they point
1306   // to the last unit copied:  end_to[0] := end_from[0]
1307 
1308   __ enter(); // required for proper stackwalking of RuntimeStub frame
1309   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1310 
1311   if (entry != nullptr) {
1312     *entry = __ pc();
1313      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1314     BLOCK_COMMENT("Entry:");
1315   }
1316 
1317   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1318                     // r9 and r10 may be used to save non-volatile registers
1319 
1320   {
1321     // UnsafeMemoryAccess page error: continue after unsafe access
1322     UnsafeMemoryAccessMark umam(this, !aligned, true);
1323     // 'from', 'to' and 'count' are now valid
1324     __ movptr(byte_count, count);
1325     __ shrptr(count, 3); // count => qword_count
1326 
1327     // Copy from low to high addresses.  Use 'to' as scratch.
1328     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1329     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1330     __ negptr(qword_count); // make the count negative
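    // With qword_count negative, Address(end_from, qword_count, Address::times_8, 8)
    // initially addresses the first qword of the source; the loop walks the
    // index up toward zero and exits once it reaches zero.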
1331     __ jmp(L_copy_bytes);
1332 
1333     // Copy trailing qwords
1334   __ BIND(L_copy_8_bytes);
1335     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1336     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1337     __ increment(qword_count);
1338     __ jcc(Assembler::notZero, L_copy_8_bytes);
1339 
1340     // Check for and copy trailing dword
1341   __ BIND(L_copy_4_bytes);
1342     __ testl(byte_count, 4);
1343     __ jccb(Assembler::zero, L_copy_2_bytes);
1344     __ movl(rax, Address(end_from, 8));
1345     __ movl(Address(end_to, 8), rax);
1346 
1347     __ addptr(end_from, 4);
1348     __ addptr(end_to, 4);
1349 
1350     // Check for and copy trailing word
1351   __ BIND(L_copy_2_bytes);
1352     __ testl(byte_count, 2);
1353     __ jccb(Assembler::zero, L_copy_byte);
1354     __ movw(rax, Address(end_from, 8));
1355     __ movw(Address(end_to, 8), rax);
1356 
1357     __ addptr(end_from, 2);
1358     __ addptr(end_to, 2);
1359 
1360     // Check for and copy trailing byte
1361   __ BIND(L_copy_byte);
1362     __ testl(byte_count, 1);
1363     __ jccb(Assembler::zero, L_exit);
1364     __ movb(rax, Address(end_from, 8));
1365     __ movb(Address(end_to, 8), rax);
1366   }
1367 __ BIND(L_exit);
1368   address ucme_exit_pc = __ pc();
1369   restore_arg_regs();
1370   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1371   __ xorptr(rax, rax); // return 0
1372   __ vzeroupper();
1373   __ leave(); // required for proper stackwalking of RuntimeStub frame
1374   __ ret(0);
1375 
1376   {
1377     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1378     // Copy in multi-byte chunks
1379     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1380     __ jmp(L_copy_4_bytes);
1381   }
1382   return start;
1383 }
1384 
1385 
1386 // Arguments:
1387 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1388 //             ignored
1389 //   name    - stub name string
1390 //
1391 // Inputs:
1392 //   c_rarg0   - source array address
1393 //   c_rarg1   - destination array address
1394 //   c_rarg2   - element count, treated as ssize_t, can be zero
1395 //
1396 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1397 // we let the hardware handle it.  The one to eight bytes within words,
1398 // dwords or qwords that span cache line boundaries will still be loaded
1399 // and stored atomically.
1400 //
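// Roughly equivalent to the following sketch (illustrative only):
//
//   if (byte_count & 1) copy the last byte;
//   if (byte_count & 2) copy the trailing word;
//   if (byte_count & 4) copy the trailing dword;
//   copy the remaining 8-byte chunks from high to low addresses;
//
// Copying from the high end first is what keeps the stub correct when the
// destination overlaps the source at a higher address.
//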
1401 address StubGenerator::generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1402                                                    address* entry, const char *name) {
1403 #if COMPILER2_OR_JVMCI
1404   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1405      return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
1406                                                nooverlap_target, aligned, false, false);
1407   }
1408 #endif
1409   __ align(CodeEntryAlignment);
1410   StubCodeMark mark(this, "StubRoutines", name);
1411   address start = __ pc();
1412   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1413 
1414   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1415   const Register from        = rdi;  // source array address
1416   const Register to          = rsi;  // destination array address
1417   const Register count       = rdx;  // elements count
1418   const Register byte_count  = rcx;
1419   const Register qword_count = count;
1420 
1421   __ enter(); // required for proper stackwalking of RuntimeStub frame
1422   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1423 
1424   if (entry != nullptr) {
1425     *entry = __ pc();
1426     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1427     BLOCK_COMMENT("Entry:");
1428   }
1429 
1430   array_overlap_test(nooverlap_target, Address::times_1);
1431   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1432                     // r9 and r10 may be used to save non-volatile registers
1433 
1434   {
1435     // UnsafeMemoryAccess page error: continue after unsafe access
1436     UnsafeMemoryAccessMark umam(this, !aligned, true);
1437     // 'from', 'to' and 'count' are now valid
1438     __ movptr(byte_count, count);
1439     __ shrptr(count, 3);   // count => qword_count
1440 
1441     // Copy from high to low addresses.
1442 
1443     // Check for and copy trailing byte
1444     __ testl(byte_count, 1);
1445     __ jcc(Assembler::zero, L_copy_2_bytes);
1446     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1447     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1448     __ decrement(byte_count); // Adjust for possible trailing word
1449 
1450     // Check for and copy trailing word
1451   __ BIND(L_copy_2_bytes);
1452     __ testl(byte_count, 2);
1453     __ jcc(Assembler::zero, L_copy_4_bytes);
1454     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1455     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1456 
1457     // Check for and copy trailing dword
1458   __ BIND(L_copy_4_bytes);
1459     __ testl(byte_count, 4);
1460     __ jcc(Assembler::zero, L_copy_bytes);
1461     __ movl(rax, Address(from, qword_count, Address::times_8));
1462     __ movl(Address(to, qword_count, Address::times_8), rax);
1463     __ jmp(L_copy_bytes);
1464 
1465     // Copy trailing qwords
1466   __ BIND(L_copy_8_bytes);
1467     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1468     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1469     __ decrement(qword_count);
1470     __ jcc(Assembler::notZero, L_copy_8_bytes);
1471   }
1472   restore_arg_regs();
1473   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1474   __ xorptr(rax, rax); // return 0
1475   __ vzeroupper();
1476   __ leave(); // required for proper stackwalking of RuntimeStub frame
1477   __ ret(0);
1478 
1479   {
1480     // UnsafeMemoryAccess page error: continue after unsafe access
1481     UnsafeMemoryAccessMark umam(this, !aligned, true);
1482     // Copy in multi-byte chunks
1483     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1484   }
1485   restore_arg_regs();
1486   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1487   __ xorptr(rax, rax); // return 0
1488   __ vzeroupper();
1489   __ leave(); // required for proper stackwalking of RuntimeStub frame
1490   __ ret(0);
1491 
1492   return start;
1493 }
1494 
1495 
1496 // Arguments:
1497 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1498 //             ignored
1499 //   name    - stub name string
1500 //
1501 // Inputs:
1502 //   c_rarg0   - source array address
1503 //   c_rarg1   - destination array address
1504 //   c_rarg2   - element count, treated as ssize_t, can be zero
1505 //
1506 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1507 // let the hardware handle it.  The two or four words within dwords
1508 // or qwords that span cache line boundaries will still be loaded
1509 // and stored atomically.
1510 //
1511 // Side Effects:
1512 //   disjoint_short_copy_entry is set to the no-overlap entry point
1513 //   used by generate_conjoint_short_copy().
1514 //
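// The shape mirrors generate_disjoint_byte_copy() above: the bulk of the data
// moves as 8-byte qwords, and the low two bits of word_count select an
// optional trailing dword and word copy.
//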
1515 address StubGenerator::generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1516 #if COMPILER2_OR_JVMCI
1517   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1518      return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
1519                                                aligned, false, false);
1520   }
1521 #endif
1522 
1523   __ align(CodeEntryAlignment);
1524   StubCodeMark mark(this, "StubRoutines", name);
1525   address start = __ pc();
1526   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1527 
1528   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1529   const Register from        = rdi;  // source array address
1530   const Register to          = rsi;  // destination array address
1531   const Register count       = rdx;  // elements count
1532   const Register word_count  = rcx;
1533   const Register qword_count = count;
1534   const Register end_from    = from; // source array end address
1535   const Register end_to      = to;   // destination array end address
1536   // End pointers are inclusive, and if count is not zero they point
1537   // to the last unit copied:  end_to[0] := end_from[0]
1538 
1539   __ enter(); // required for proper stackwalking of RuntimeStub frame
1540   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1541 
1542   if (entry != nullptr) {
1543     *entry = __ pc();
1544     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1545     BLOCK_COMMENT("Entry:");
1546   }
1547 
1548   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1549                     // r9 and r10 may be used to save non-volatile registers
1550 
1551   {
1552     // UnsafeMemoryAccess page error: continue after unsafe access
1553     UnsafeMemoryAccessMark umam(this, !aligned, true);
1554     // 'from', 'to' and 'count' are now valid
1555     __ movptr(word_count, count);
1556     __ shrptr(count, 2); // count => qword_count
1557 
1558     // Copy from low to high addresses.  Use 'to' as scratch.
1559     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1560     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1561     __ negptr(qword_count);
1562     __ jmp(L_copy_bytes);
1563 
1564     // Copy trailing qwords
1565   __ BIND(L_copy_8_bytes);
1566     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1567     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1568     __ increment(qword_count);
1569     __ jcc(Assembler::notZero, L_copy_8_bytes);
1570 
1571     // Original 'dest' is trashed, so we can't use it as a
1572     // base register for a possible trailing word copy
1573 
1574     // Check for and copy trailing dword
1575   __ BIND(L_copy_4_bytes);
1576     __ testl(word_count, 2);
1577     __ jccb(Assembler::zero, L_copy_2_bytes);
1578     __ movl(rax, Address(end_from, 8));
1579     __ movl(Address(end_to, 8), rax);
1580 
1581     __ addptr(end_from, 4);
1582     __ addptr(end_to, 4);
1583 
1584     // Check for and copy trailing word
1585   __ BIND(L_copy_2_bytes);
1586     __ testl(word_count, 1);
1587     __ jccb(Assembler::zero, L_exit);
1588     __ movw(rax, Address(end_from, 8));
1589     __ movw(Address(end_to, 8), rax);
1590   }
1591 __ BIND(L_exit);
1592   address ucme_exit_pc = __ pc();
1593   restore_arg_regs();
1594   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1595   __ xorptr(rax, rax); // return 0
1596   __ vzeroupper();
1597   __ leave(); // required for proper stackwalking of RuntimeStub frame
1598   __ ret(0);
1599 
1600   {
1601     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1602     // Copy in multi-byte chunks
1603     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1604     __ jmp(L_copy_4_bytes);
1605   }
1606 
1607   return start;
1608 }
1609 
1610 
1611 address StubGenerator::generate_fill(BasicType t, bool aligned, const char *name) {
1612   __ align(CodeEntryAlignment);
1613   StubCodeMark mark(this, "StubRoutines", name);
1614   address start = __ pc();
1615 
1616   BLOCK_COMMENT("Entry:");
1617 
1618   const Register to       = c_rarg0;  // destination array address
1619   const Register value    = c_rarg1;  // value
1620   const Register count    = c_rarg2;  // elements count
1621   __ mov(r11, count);
1622 
1623   __ enter(); // required for proper stackwalking of RuntimeStub frame
1624 
1625   {
1626     // Add an unsafe memory access mark so that a fault during the fill is handled gracefully
1627     UnsafeMemoryAccessMark umam(this, ((t == T_BYTE) && !aligned), true);
1628     __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1629   }
1630 
1631   __ vzeroupper();
1632   __ leave(); // required for proper stackwalking of RuntimeStub frame
1633   __ ret(0);
1634 
1635   return start;
1636 }
1637 
1638 
1639 // Arguments:
1640 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1641 //             ignored
1642 //   name    - stub name string
1643 //
1644 // Inputs:
1645 //   c_rarg0   - source array address
1646 //   c_rarg1   - destination array address
1647 //   c_rarg2   - element count, treated as ssize_t, can be zero
1648 //
1649 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1650 // let the hardware handle it.  The two or four words within dwords
1651 // or qwords that span cache line boundaries will still be loaded
1652 // and stored atomically.
1653 //
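// Mirrors generate_conjoint_byte_copy() above: an optional trailing word and
// dword are copied from the high end first, followed by a backward qword loop.
//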
1654 address StubGenerator::generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1655                                                     address *entry, const char *name) {
1656 #if COMPILER2_OR_JVMCI
1657   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1658      return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
1659                                                nooverlap_target, aligned, false, false);
1660   }
1661 #endif
1662   __ align(CodeEntryAlignment);
1663   StubCodeMark mark(this, "StubRoutines", name);
1664   address start = __ pc();
1665   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1666 
1667   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1668   const Register from        = rdi;  // source array address
1669   const Register to          = rsi;  // destination array address
1670   const Register count       = rdx;  // elements count
1671   const Register word_count  = rcx;
1672   const Register qword_count = count;
1673 
1674   __ enter(); // required for proper stackwalking of RuntimeStub frame
1675   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1676 
1677   if (entry != nullptr) {
1678     *entry = __ pc();
1679     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1680     BLOCK_COMMENT("Entry:");
1681   }
1682 
1683   array_overlap_test(nooverlap_target, Address::times_2);
1684   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1685                     // r9 and r10 may be used to save non-volatile registers
1686 
1687   {
1688     // UnsafeMemoryAccess page error: continue after unsafe access
1689     UnsafeMemoryAccessMark umam(this, !aligned, true);
1690     // 'from', 'to' and 'count' are now valid
1691     __ movptr(word_count, count);
1692     __ shrptr(count, 2); // count => qword_count
1693 
1694     // Copy from high to low addresses.  Use 'to' as scratch.
1695 
1696     // Check for and copy trailing word
1697     __ testl(word_count, 1);
1698     __ jccb(Assembler::zero, L_copy_4_bytes);
1699     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1700     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1701 
1702     // Check for and copy trailing dword
1703   __ BIND(L_copy_4_bytes);
1704     __ testl(word_count, 2);
1705     __ jcc(Assembler::zero, L_copy_bytes);
1706     __ movl(rax, Address(from, qword_count, Address::times_8));
1707     __ movl(Address(to, qword_count, Address::times_8), rax);
1708     __ jmp(L_copy_bytes);
1709 
1710     // Copy trailing qwords
1711   __ BIND(L_copy_8_bytes);
1712     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1713     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1714     __ decrement(qword_count);
1715     __ jcc(Assembler::notZero, L_copy_8_bytes);
1716   }
1717   restore_arg_regs();
1718   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1719   __ xorptr(rax, rax); // return 0
1720   __ vzeroupper();
1721   __ leave(); // required for proper stackwalking of RuntimeStub frame
1722   __ ret(0);
1723 
1724   {
1725     // UnsafeMemoryAccess page error: continue after unsafe access
1726     UnsafeMemoryAccessMark umam(this, !aligned, true);
1727     // Copy in multi-byte chunks
1728     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1729   }
1730   restore_arg_regs();
1731   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1732   __ xorptr(rax, rax); // return 0
1733   __ vzeroupper();
1734   __ leave(); // required for proper stackwalking of RuntimeStub frame
1735   __ ret(0);
1736 
1737   return start;
1738 }
1739 
1740 
1741 // Arguments:
1742 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1743 //             ignored
1744 //   is_oop  - true => oop array, so generate store check code
1745 //   name    - stub name string
1746 //
1747 // Inputs:
1748 //   c_rarg0   - source array address
1749 //   c_rarg1   - destination array address
1750 //   c_rarg2   - element count, treated as ssize_t, can be zero
1751 //
1752 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1753 // the hardware handle it.  The two dwords within qwords that span
1754 // cache line boundaries will still be loaded and stored atomically.
1755 //
1756 // Side Effects:
1757 //   disjoint_int_copy_entry is set to the no-overlap entry point
1758 //   used by generate_conjoint_int_oop_copy().
1759 //
1760 address StubGenerator::generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1761                                                       const char *name, bool dest_uninitialized) {
1762   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1763 #if COMPILER2_OR_JVMCI
1764   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1765      return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
1766                                                aligned, is_oop, dest_uninitialized);
1767   }
1768 #endif
1769 
1770   __ align(CodeEntryAlignment);
1771   StubCodeMark mark(this, "StubRoutines", name);
1772   address start = __ pc();
1773 
1774   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1775   const Register from        = rdi;  // source array address
1776   const Register to          = rsi;  // destination array address
1777   const Register count       = rdx;  // elements count
1778   const Register dword_count = rcx;
1779   const Register qword_count = count;
1780   const Register end_from    = from; // source array end address
1781   const Register end_to      = to;   // destination array end address
1782   // End pointers are inclusive, and if count is not zero they point
1783   // to the last unit copied:  end_to[0] := end_from[0]
1784 
1785   __ enter(); // required for proper stackwalking of RuntimeStub frame
1786   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1787 
1788   if (entry != nullptr) {
1789     *entry = __ pc();
1790     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1791     BLOCK_COMMENT("Entry:");
1792   }
1793 
1794   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1795                                  // r9 is used to save r15_thread
1796 
1797   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1798   if (dest_uninitialized) {
1799     decorators |= IS_DEST_UNINITIALIZED;
1800   }
1801   if (aligned) {
1802     decorators |= ARRAYCOPY_ALIGNED;
1803   }
1804 
1805   BasicType type = is_oop ? T_OBJECT : T_INT;
1806   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1807 
1808   {
1809     // UnsafeMemoryAccess page error: continue after unsafe access
1810     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
1811     // 'from', 'to' and 'count' are now valid
1812     __ movptr(dword_count, count);
1813     __ shrptr(count, 1); // count => qword_count
1814 
1815     // Copy from low to high addresses.  Use 'to' as scratch.
1816     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1817     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1818     __ negptr(qword_count);
1819     __ jmp(L_copy_bytes);
1820 
1821     // Copy trailing qwords
1822   __ BIND(L_copy_8_bytes);
1823     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1824     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1825     __ increment(qword_count);
1826     __ jcc(Assembler::notZero, L_copy_8_bytes);
1827 
1828     // Check for and copy trailing dword
1829   __ BIND(L_copy_4_bytes);
1830     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1831     __ jccb(Assembler::zero, L_exit);
1832     __ movl(rax, Address(end_from, 8));
1833     __ movl(Address(end_to, 8), rax);
1834   }
1835 __ BIND(L_exit);
1836   address ucme_exit_pc = __ pc();
1837   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1838   restore_arg_regs_using_thread();
1839   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1840   __ vzeroupper();
1841   __ xorptr(rax, rax); // return 0
1842   __ leave(); // required for proper stackwalking of RuntimeStub frame
1843   __ ret(0);
1844 
1845   {
1846     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
1847     // Copy in multi-byte chunks
1848     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
1849     __ jmp(L_copy_4_bytes);
1850   }
1851 
1852   return start;
1853 }
1854 
1855 
1856 // Arguments:
1857 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1858 //             ignored
1859 //   is_oop  - true => oop array, so generate store check code
1860 //   name    - stub name string
1861 //
1862 // Inputs:
1863 //   c_rarg0   - source array address
1864 //   c_rarg1   - destination array address
1865 //   c_rarg2   - element count, treated as ssize_t, can be zero
1866 //
1867 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1868 // the hardware handle it.  The two dwords within qwords that span
1869 // cache line boundaries will still be loaded and stored atomically.
1870 //
1871 address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1872                                                       address *entry, const char *name,
1873                                                       bool dest_uninitialized) {
1874   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1875 #if COMPILER2_OR_JVMCI
1876   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1877      return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
1878                                                nooverlap_target, aligned, is_oop, dest_uninitialized);
1879   }
1880 #endif
1881   __ align(CodeEntryAlignment);
1882   StubCodeMark mark(this, "StubRoutines", name);
1883   address start = __ pc();
1884 
1885   Label L_copy_bytes, L_copy_8_bytes, L_exit;
1886   const Register from        = rdi;  // source array address
1887   const Register to          = rsi;  // destination array address
1888   const Register count       = rdx;  // elements count
1889   const Register dword_count = rcx;
1890   const Register qword_count = count;
1891 
1892   __ enter(); // required for proper stackwalking of RuntimeStub frame
1893   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1894 
1895   if (entry != nullptr) {
1896     *entry = __ pc();
1897      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1898     BLOCK_COMMENT("Entry:");
1899   }
1900 
1901   array_overlap_test(nooverlap_target, Address::times_4);
1902   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1903                                  // r9 is used to save r15_thread
1904 
1905   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1906   if (dest_uninitialized) {
1907     decorators |= IS_DEST_UNINITIALIZED;
1908   }
1909   if (aligned) {
1910     decorators |= ARRAYCOPY_ALIGNED;
1911   }
1912 
1913   BasicType type = is_oop ? T_OBJECT : T_INT;
1914   // no registers are destroyed by this call
1915   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1916 
1917   assert_clean_int(count, rax); // Make sure 'count' is clean int.
1918   {
1919     // UnsafeMemoryAccess page error: continue after unsafe access
1920     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
1921     // 'from', 'to' and 'count' are now valid
1922     __ movptr(dword_count, count);
1923     __ shrptr(count, 1); // count => qword_count
1924 
1925     // Copy from high to low addresses.  Use 'to' as scratch.
1926 
1927     // Check for and copy trailing dword
1928     __ testl(dword_count, 1);
1929     __ jcc(Assembler::zero, L_copy_bytes);
1930     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
1931     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
1932     __ jmp(L_copy_bytes);
1933 
1934     // Copy trailing qwords
1935   __ BIND(L_copy_8_bytes);
1936     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1937     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1938     __ decrement(qword_count);
1939     __ jcc(Assembler::notZero, L_copy_8_bytes);
1940   }
1941   if (is_oop) {
1942     __ jmp(L_exit);
1943   }
1944   restore_arg_regs_using_thread();
1945   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1946   __ xorptr(rax, rax); // return 0
1947   __ vzeroupper();
1948   __ leave(); // required for proper stackwalking of RuntimeStub frame
1949   __ ret(0);
1950 
1951   {
1952     // UnsafeMemoryAccess page error: continue after unsafe access
1953     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
1954     // Copy in multi-byte chunks
1955     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
1956   }
1957 
1958 __ BIND(L_exit);
1959   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1960   restore_arg_regs_using_thread();
1961   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1962   __ xorptr(rax, rax); // return 0
1963   __ vzeroupper();
1964   __ leave(); // required for proper stackwalking of RuntimeStub frame
1965   __ ret(0);
1966 
1967   return start;
1968 }
1969 
1970 
1971 // Arguments:
1972 //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1973 //             ignored
1974 //   is_oop  - true => oop array, so generate store check code
1975 //   name    - stub name string
1976 //
1977 // Inputs:
1978 //   c_rarg0   - source array address
1979 //   c_rarg1   - destination array address
1980 //   c_rarg2   - element count, treated as ssize_t, can be zero
1981 //
1982 // Side Effects:
1983 //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1984 //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1985 //
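// For oop arrays the copy is bracketed by the BarrierSetAssembler's
// arraycopy_prologue()/arraycopy_epilogue(), and each element moves through
// copy_load_at()/copy_store_at() so the active GC's barriers are applied.
//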
1986 address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
1987                                                        const char *name, bool dest_uninitialized) {
1988   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1989 #if COMPILER2_OR_JVMCI
1990   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1991      return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
1992                                                aligned, is_oop, dest_uninitialized);
1993   }
1994 #endif
1995   __ align(CodeEntryAlignment);
1996   StubCodeMark mark(this, "StubRoutines", name);
1997   address start = __ pc();
1998 
1999   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2000   const Register from        = rdi;  // source array address
2001   const Register to          = rsi;  // destination array address
2002   const Register qword_count = rdx;  // elements count
2003   const Register end_from    = from; // source array end address
2004   const Register end_to      = rcx;  // destination array end address
2005   const Register saved_count = r11;
2006   // End pointers are inclusive, and if count is not zero they point
2007   // to the last unit copied:  end_to[0] := end_from[0]
2008 
2009   __ enter(); // required for proper stackwalking of RuntimeStub frame
2010   // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2011   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2012 
2013   if (entry != nullptr) {
2014     *entry = __ pc();
2015     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2016     BLOCK_COMMENT("Entry:");
2017   }
2018 
2019   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2020                                    // r9 is used to save r15_thread
2021   // 'from', 'to' and 'qword_count' are now valid
2022 
2023   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2024   if (dest_uninitialized) {
2025     decorators |= IS_DEST_UNINITIALIZED;
2026   }
2027   if (aligned) {
2028     decorators |= ARRAYCOPY_ALIGNED;
2029   }
2030 
2031   BasicType type = is_oop ? T_OBJECT : T_LONG;
2032   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2033   {
2034     // UnsafeMemoryAccess page error: continue after unsafe access
2035     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2036 
2037     // Copy from low to high addresses.  Use 'to' as scratch.
2038     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2039     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2040     __ negptr(qword_count);
2041     __ jmp(L_copy_bytes);
2042 
2043     // Copy trailing qwords
2044   __ BIND(L_copy_8_bytes);
2045     bs->copy_load_at(_masm, decorators, type, 8,
2046                      rax, Address(end_from, qword_count, Address::times_8, 8),
2047                      r10);
2048     bs->copy_store_at(_masm, decorators, type, 8,
2049                       Address(end_to, qword_count, Address::times_8, 8), rax,
2050                       r10);
2051     __ increment(qword_count);
2052     __ jcc(Assembler::notZero, L_copy_8_bytes);
2053   }
2054   if (is_oop) {
2055     __ jmp(L_exit);
2056   } else {
2057     restore_arg_regs_using_thread();
2058     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2059     __ xorptr(rax, rax); // return 0
2060     __ vzeroupper();
2061     __ leave(); // required for proper stackwalking of RuntimeStub frame
2062     __ ret(0);
2063   }
2064 
2065   {
2066     // UnsafeMemoryAccess page error: continue after unsafe access
2067     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2068     // Copy in multi-byte chunks
2069     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2070   }
2071 
2072   __ BIND(L_exit);
2073   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2074   restore_arg_regs_using_thread();
2075   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2076                           SharedRuntime::_jlong_array_copy_ctr,
2077                  rscratch1); // Update counter after rscratch1 is free
2078   __ vzeroupper();
2079   __ xorptr(rax, rax); // return 0
2080   __ leave(); // required for proper stackwalking of RuntimeStub frame
2081   __ ret(0);
2082 
2083   return start;
2084 }
2085 
2086 
2087 // Arguments:
2088 //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2089 //             ignored
2090 //   is_oop  - true => oop array, so generate store check code
2091 //   name    - stub name string
2092 //
2093 // Inputs:
2094 //   c_rarg0   - source array address
2095 //   c_rarg1   - destination array address
2096 //   c_rarg2   - element count, treated as ssize_t, can be zero
2097 //
2098 address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2099                                                        address *entry, const char *name,
2100                                                        bool dest_uninitialized) {
2101   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2102 #if COMPILER2_OR_JVMCI
2103   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2104      return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
2105                                                nooverlap_target, aligned, is_oop, dest_uninitialized);
2106   }
2107 #endif
2108   __ align(CodeEntryAlignment);
2109   StubCodeMark mark(this, "StubRoutines", name);
2110   address start = __ pc();
2111 
2112   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2113   const Register from        = rdi;  // source array address
2114   const Register to          = rsi;  // destination array address
2115   const Register qword_count = rdx;  // elements count
2116   const Register saved_count = rcx;
2117 
2118   __ enter(); // required for proper stackwalking of RuntimeStub frame
2119   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2120 
2121   if (entry != nullptr) {
2122     *entry = __ pc();
2123     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2124     BLOCK_COMMENT("Entry:");
2125   }
2126 
2127   array_overlap_test(nooverlap_target, Address::times_8);
2128   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2129                                  // r9 is used to save r15_thread
2130   // 'from', 'to' and 'qword_count' are now valid
2131 
2132   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2133   if (dest_uninitialized) {
2134     decorators |= IS_DEST_UNINITIALIZED;
2135   }
2136   if (aligned) {
2137     decorators |= ARRAYCOPY_ALIGNED;
2138   }
2139 
2140   BasicType type = is_oop ? T_OBJECT : T_LONG;
2141   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2142   {
2143     // UnsafeMemoryAccess page error: continue after unsafe access
2144     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2145 
2146     __ jmp(L_copy_bytes);
2147 
2148     // Copy trailing qwords
2149   __ BIND(L_copy_8_bytes);
2150     bs->copy_load_at(_masm, decorators, type, 8,
2151                      rax, Address(from, qword_count, Address::times_8, -8),
2152                      r10);
2153     bs->copy_store_at(_masm, decorators, type, 8,
2154                       Address(to, qword_count, Address::times_8, -8), rax,
2155                       r10);
2156     __ decrement(qword_count);
2157     __ jcc(Assembler::notZero, L_copy_8_bytes);
2158   }
2159   if (is_oop) {
2160     __ jmp(L_exit);
2161   } else {
2162     restore_arg_regs_using_thread();
2163     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2164     __ xorptr(rax, rax); // return 0
2165     __ vzeroupper();
2166     __ leave(); // required for proper stackwalking of RuntimeStub frame
2167     __ ret(0);
2168   }
2169   {
2170     // UnsafeMemoryAccess page error: continue after unsafe access
2171     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2172 
2173     // Copy in multi-byte chunks
2174     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2175   }
2176   __ BIND(L_exit);
2177   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2178   restore_arg_regs_using_thread();
2179   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2180                           SharedRuntime::_jlong_array_copy_ctr,
2181                  rscratch1); // Update counter after rscratch1 is free
2182   __ vzeroupper();
2183   __ xorptr(rax, rax); // return 0
2184   __ leave(); // required for proper stackwalking of RuntimeStub frame
2185   __ ret(0);
2186 
2187   return start;
2188 }
2189 
2190 
2191 // Helper for generating a dynamic type check.
2192 // Smashes no registers.
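// The fast path covers the common cases (e.g. an exact klass match or a hit
// through super_check_offset / the secondary-super cache); everything else
// drops into the slow path, which scans the secondary supers.  On failure,
// control falls through to L_miss below.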
2193 void StubGenerator::generate_type_check(Register sub_klass,
2194                                         Register super_check_offset,
2195                                         Register super_klass,
2196                                         Label& L_success) {
2197   assert_different_registers(sub_klass, super_check_offset, super_klass);
2198 
2199   BLOCK_COMMENT("type_check:");
2200 
2201   Label L_miss;
2202 
2203   __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
2204                                    super_check_offset);
2205   __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
2206 
2207   // Fall through on failure!
2208   __ BIND(L_miss);
2209 }
2210 
2211 //
2212 //  Generate checkcasting array copy stub
2213 //
2214 //  Input:
2215 //    c_rarg0   - source array address
2216 //    c_rarg1   - destination array address
2217 //    c_rarg2   - element count, treated as ssize_t, can be zero
2218 //    c_rarg3   - size_t ckoff (super_check_offset)
2219 // not Win64
2220 //    c_rarg4   - oop ckval (super_klass)
2221 // Win64
2222 //    rsp+40    - oop ckval (super_klass)
2223 //
2224 //  Output:
2225 //    rax ==  0  -  success
2226 //    rax == -1^K - failure, where K is partial transfer count
2227 //
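//  The failure value is the bitwise complement of the transfer count
//  (rax == ~K); for example, if 3 elements were copied before the failing
//  element, rax == -4.
//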
2228 address StubGenerator::generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized) {
2229 
2230   Label L_load_element, L_store_element, L_do_card_marks, L_done;
2231 
2232   // Input registers (after setup_arg_regs)
2233   const Register from        = rdi;   // source array address
2234   const Register to          = rsi;   // destination array address
2235   const Register length      = rdx;   // elements count
2236   const Register ckoff       = rcx;   // super_check_offset
2237   const Register ckval       = r8;    // super_klass
2238 
2239   // Registers used as temps (r13, r14 are save-on-entry)
2240   const Register end_from    = from;  // source array end address
2241   const Register end_to      = r13;   // destination array end address
2242   const Register count       = rdx;   // -(count_remaining)
2243   const Register r14_length  = r14;   // saved copy of length
2244   // End pointers are inclusive, and if length is not zero they point
2245   // to the last unit copied:  end_to[0] := end_from[0]
2246 
2247   const Register rax_oop    = rax;    // actual oop copied
2248   const Register r11_klass  = r11;    // oop._klass
2249 
2250   //---------------------------------------------------------------
2251   // Assembler stub will be used for this call to arraycopy
2252   // if the two arrays are subtypes of Object[] but the
2253   // destination array type is not equal to or a supertype
2254   // of the source type.  Each element must be separately
2255   // checked.
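  // Per-element sketch (illustrative): for each element o, store o if it is
  // null or if o's klass is a subtype of ckval; on the first failing element
  // stop and report the bitwise complement of the number already copied.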
2256 
2257   __ align(CodeEntryAlignment);
2258   StubCodeMark mark(this, "StubRoutines", name);
2259   address start = __ pc();
2260 
2261   __ enter(); // required for proper stackwalking of RuntimeStub frame
2262 
2263 #ifdef ASSERT
2264   // caller guarantees that the arrays really are different
2265   // otherwise, we would have to make conjoint checks
2266   { Label L;
2267     array_overlap_test(L, TIMES_OOP);
2268     __ stop("checkcast_copy within a single array");
2269     __ bind(L);
2270   }
2271 #endif //ASSERT
2272 
2273   setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
2274                                   // ckoff => rcx, ckval => r8
2275                                   // r9 is used to save r15_thread
2276 #ifdef _WIN64
2277   // last argument (#4) is on stack on Win64
2278   __ movptr(ckval, Address(rsp, 6 * wordSize));
2279 #endif
2280 
2281   // Caller of this entry point must set up the argument registers.
2282   if (entry != nullptr) {
2283     *entry = __ pc();
2284     BLOCK_COMMENT("Entry:");
2285   }
2286 
2287   // allocate spill slots for r13, r14 and r10
2288   enum {
2289     saved_r13_offset,
2290     saved_r14_offset,
2291     saved_r10_offset,
2292     saved_rbp_offset
2293   };
2294   __ subptr(rsp, saved_rbp_offset * wordSize);
2295   __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2296   __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2297   __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2298 
2299 #ifdef ASSERT
2300     Label L2;
2301     __ get_thread(r14);
2302     __ cmpptr(r15_thread, r14);
2303     __ jcc(Assembler::equal, L2);
2304     __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2305     __ bind(L2);
2306 #endif // ASSERT
2307 
2308   // check that int operands are properly extended to size_t
2309   assert_clean_int(length, rax);
2310   assert_clean_int(ckoff, rax);
2311 
2312 #ifdef ASSERT
2313   BLOCK_COMMENT("assert consistent ckoff/ckval");
2314   // The ckoff and ckval must be mutually consistent,
2315   // even though caller generates both.
2316   { Label L;
2317     int sco_offset = in_bytes(Klass::super_check_offset_offset());
2318     __ cmpl(ckoff, Address(ckval, sco_offset));
2319     __ jcc(Assembler::equal, L);
2320     __ stop("super_check_offset inconsistent");
2321     __ bind(L);
2322   }
2323 #endif //ASSERT
2324 
2325   // Loop-invariant addresses.  They are exclusive end pointers.
2326   Address end_from_addr(from, length, TIMES_OOP, 0);
2327   Address   end_to_addr(to,   length, TIMES_OOP, 0);
2328   // Loop-variant addresses.  They assume post-incremented count < 0.
2329   Address from_element_addr(end_from, count, TIMES_OOP, 0);
2330   Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2331 
2332   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2333   if (dest_uninitialized) {
2334     decorators |= IS_DEST_UNINITIALIZED;
2335   }
2336 
2337   BasicType type = T_OBJECT;
2338   size_t element_size = UseCompressedOops ? 4 : 8;
2339 
2340   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2341   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2342 
2343   // Copy from low to high addresses, indexed from the end of each array.
2344   __ lea(end_from, end_from_addr);
2345   __ lea(end_to,   end_to_addr);
2346   __ movptr(r14_length, length);        // save a copy of the length
2347   assert(length == count, "");          // else fix next line:
2348   __ negptr(count);                     // negate and test the length
2349   __ jcc(Assembler::notZero, L_load_element);
2350 
2351   // Empty array:  Nothing to do.
2352   __ xorptr(rax, rax);                  // return 0 on (trivial) success
2353   __ jmp(L_done);
2354 
2355   // ======== begin loop ========
2356   // (Loop is rotated; its entry is L_load_element.)
2357   // Loop control:
2358   //   for (count = -count; count != 0; count++)
2359   // Base pointers src, dst are biased by 8*(count-1), to last element.
2360   __ align(OptoLoopAlignment);
2361 
2362   __ BIND(L_store_element);
2363   bs->copy_store_at(_masm,
2364                     decorators,
2365                     type,
2366                     element_size,
2367                     to_element_addr,
2368                     rax_oop,
2369                     r10);
2370   __ increment(count);               // increment the count toward zero
2371   __ jcc(Assembler::zero, L_do_card_marks);
2372 
2373   // ======== loop entry is here ========
2374   __ BIND(L_load_element);
2375   bs->copy_load_at(_masm,
2376                    decorators,
2377                    type,
2378                    element_size,
2379                    rax_oop,
2380                    from_element_addr,
2381                    r10);
2382   __ testptr(rax_oop, rax_oop);
2383   __ jcc(Assembler::zero, L_store_element);
2384 
2385   __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2386   generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2387   // ======== end loop ========
2388 
2389   // It was a real error; we must depend on the caller to finish the job.
2390   // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2391   // Emit GC store barriers for the oops we have copied (r14 + rdx),
2392   // and report their number to the caller.
2393   assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2394   Label L_post_barrier;
2395   __ addptr(r14_length, count);     // K = (original - remaining) oops
2396   __ movptr(rax, r14_length);       // save the value
2397   __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2398   __ jccb(Assembler::notZero, L_post_barrier);
2399   __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2400 
2401   // Come here on success only.
2402   __ BIND(L_do_card_marks);
2403   __ xorptr(rax, rax);              // return 0 on success
2404 
2405   __ BIND(L_post_barrier);
2406   bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2407 
2408   // Common exit point (success or failure).
2409   __ BIND(L_done);
2410   __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2411   __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2412   __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2413   restore_arg_regs_using_thread();
2414   INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2415   __ leave(); // required for proper stackwalking of RuntimeStub frame
2416   __ ret(0);
2417 
2418   return start;
2419 }
2420 
2421 
2422 //  Generate 'unsafe' array copy stub
2423 //  Though just as safe as the other stubs, it takes an unscaled
2424 //  size_t argument instead of an element count.
2425 //
2426 //  Input:
2427 //    c_rarg0   - source array address
2428 //    c_rarg1   - destination array address
2429 //    c_rarg2   - byte count, treated as ssize_t, can be zero
2430 //
2431 // Examines the alignment of the operands and dispatches
2432 // to a long, int, short, or byte copy loop.
2433 //
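// Roughly:
//
//   bits = from | to | size;
//   if ((bits & 7) == 0)      tail-call the long  copy stub with size >> 3 elements;
//   else if ((bits & 3) == 0) tail-call the int   copy stub with size >> 2 elements;
//   else if ((bits & 1) == 0) tail-call the short copy stub with size >> 1 elements;
//   else                      tail-call the byte  copy stub with size elements;
//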
2434 address StubGenerator::generate_unsafe_copy(const char *name,
2435                                             address byte_copy_entry, address short_copy_entry,
2436                                             address int_copy_entry, address long_copy_entry) {
2437 
2438   Label L_long_aligned, L_int_aligned, L_short_aligned;
2439 
2440   // Input registers (before setup_arg_regs)
2441   const Register from        = c_rarg0;  // source array address
2442   const Register to          = c_rarg1;  // destination array address
2443   const Register size        = c_rarg2;  // byte count (size_t)
2444 
2445   // Register used as a temp
2446   const Register bits        = rax;      // test copy of low bits
2447 
2448   __ align(CodeEntryAlignment);
2449   StubCodeMark mark(this, "StubRoutines", name);
2450   address start = __ pc();
2451 
2452   __ enter(); // required for proper stackwalking of RuntimeStub frame
2453 
2454   // bump this on entry, not on exit:
2455   INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
2456 
2457   __ mov(bits, from);
2458   __ orptr(bits, to);
2459   __ orptr(bits, size);
2460 
2461   __ testb(bits, BytesPerLong-1);
2462   __ jccb(Assembler::zero, L_long_aligned);
2463 
2464   __ testb(bits, BytesPerInt-1);
2465   __ jccb(Assembler::zero, L_int_aligned);
2466 
2467   __ testb(bits, BytesPerShort-1);
2468   __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2469 
2470   __ BIND(L_short_aligned);
2471   __ shrptr(size, LogBytesPerShort); // size => short_count
2472   __ jump(RuntimeAddress(short_copy_entry));
2473 
2474   __ BIND(L_int_aligned);
2475   __ shrptr(size, LogBytesPerInt); // size => int_count
2476   __ jump(RuntimeAddress(int_copy_entry));
2477 
2478   __ BIND(L_long_aligned);
2479   __ shrptr(size, LogBytesPerLong); // size => qword_count
2480   __ jump(RuntimeAddress(long_copy_entry));
2481 
2482   return start;
2483 }
2484 
2485 
2486 // Static enum for helper
2487 enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
2488 // Helper for generate_unsafe_setmemory
2489 //
2490 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
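// Roughly:
//
//   chunks = size >> shift;              // number of 2-, 4- or 8-byte stores
//   for (i = 0; i < chunks / 8; i++)     // main loop, 8 stores per iteration
//     store wide_value 8 times and advance dest;
//   for (j = 0; j < chunks % 8; j++)     // tail loop
//     store wide_value once and advance dest;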
2491 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
2492                                      Register size, Register wide_value,
2493                                      Register tmp, Label& L_exit,
2494                                      MacroAssembler *_masm) {
2495   Label L_Loop, L_Tail, L_TailLoop;
2496 
2497   int shiftval = 0;
2498   int incr = 0;
2499 
2500   switch (type) {
2501     case USM_SHORT:
2502       shiftval = 1;
2503       incr = 16;
2504       break;
2505     case USM_DWORD:
2506       shiftval = 2;
2507       incr = 32;
2508       break;
2509     case USM_QUADWORD:
2510       shiftval = 3;
2511       incr = 64;
2512       break;
2513   }
2514 
2515   // At this point, we know the lower bits of size are zero
2516   __ shrq(size, shiftval);
2517   // size now has number of X-byte chunks (2, 4 or 8)
2518 
2519   // Number of (8*X)-byte chunks into tmp
2520   __ movq(tmp, size);
2521   __ shrq(tmp, 3);
2522   __ jccb(Assembler::zero, L_Tail);
2523 
2524   __ BIND(L_Loop);
2525 
2526   // Unroll 8 stores
2527   for (int i = 0; i < 8; i++) {
2528     switch (type) {
2529       case USM_SHORT:
2530         __ movw(Address(dest, (2 * i)), wide_value);
2531         break;
2532       case USM_DWORD:
2533         __ movl(Address(dest, (4 * i)), wide_value);
2534         break;
2535       case USM_QUADWORD:
2536         __ movq(Address(dest, (8 * i)), wide_value);
2537         break;
2538     }
2539   }
2540   __ addq(dest, incr);
2541   __ decrementq(tmp);
2542   __ jccb(Assembler::notZero, L_Loop);
2543 
2544   __ BIND(L_Tail);
2545 
2546   // Find number of remaining X-byte chunks
2547   __ andq(size, 0x7);
2548 
2549   // If zero, then we're done
2550   __ jccb(Assembler::zero, L_exit);
2551 
2552   __ BIND(L_TailLoop);
2553 
2554     switch (type) {
2555       case USM_SHORT:
2556         __ movw(Address(dest, 0), wide_value);
2557         break;
2558       case USM_DWORD:
2559         __ movl(Address(dest, 0), wide_value);
2560         break;
2561       case USM_QUADWORD:
2562         __ movq(Address(dest, 0), wide_value);
2563         break;
2564     }
2565   __ addq(dest, incr >> 3);
2566   __ decrementq(size);
2567   __ jccb(Assembler::notZero, L_TailLoop);
2568 }
2569 
2570 //  Generate 'unsafe' set memory stub
2571 //  Though just as safe as the other stubs, it takes an unscaled
2572 //  size_t (# bytes) argument instead of an element count.
2573 //
2574 //  Input:
2575 //    c_rarg0   - destination array address
2576 //    c_rarg1   - byte count (size_t)
2577 //    c_rarg2   - byte value
2578 //
2579 // Examines the alignment of the operands and dispatches
2580 // to a quadword, dword, word, or byte fill loop.
2581 //
2582 address StubGenerator::generate_unsafe_setmemory(const char *name,
2583                                                  address unsafe_byte_fill) {
2584   __ align(CodeEntryAlignment);
2585   StubCodeMark mark(this, "StubRoutines", name);
2586   address start = __ pc();
2587   __ enter();   // required for proper stackwalking of RuntimeStub frame
2588 
2589   assert(unsafe_byte_fill != nullptr, "Invalid call");
2590 
2591   // bump this on entry, not on exit:
2592   INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);
2593 
2594   {
2595     Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;
2596 
2597     const Register dest = c_rarg0;
2598     const Register size = c_rarg1;
2599     const Register byteVal = c_rarg2;
2600     const Register wide_value = rax;
2601     const Register rScratch1 = r10;
2602 
2603     assert_different_registers(dest, size, byteVal, wide_value, rScratch1);
2604 
2605     //     fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2606 
2607     __ testq(size, size);
2608     __ jcc(Assembler::zero, L_exit);
2609 
2610     // Propagate byte to full Register
2611     __ movzbl(rScratch1, byteVal);
2612     __ mov64(wide_value, 0x0101010101010101ULL);
2613     __ imulq(wide_value, rScratch1);
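         // Multiplying by 0x0101010101010101 replicates the zero-extended byte
         // into every byte lane, e.g. 0xAB becomes 0xABABABABABABABAB, so one
         // wide store writes the fill value to 2, 4 or 8 bytes at once.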
2614 
2615     // Check for pointer & size alignment
2616     __ movq(rScratch1, dest);
2617     __ orq(rScratch1, size);
2618 
2619     __ testb(rScratch1, 7);
2620     __ jcc(Assembler::equal, L_fillQuadwords);
2621 
2622     __ testb(rScratch1, 3);
2623     __ jcc(Assembler::equal, L_fillDwords);
2624 
2625     __ testb(rScratch1, 1);
2626     __ jcc(Assembler::notEqual, L_fillBytes);
2627 
2628     // Fill words
2629     {
2630       UnsafeMemoryAccessMark umam(this, true, true);
2631 
2632       // At this point we know the low bit of both dest and size is zero,
2633       // so size is a multiple of 2 and dest is 2-byte aligned
2634       do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
2635                                L_exit, _masm);
2636     }
2637     __ jmpb(L_exit);
2638 
2639     __ BIND(L_fillQuadwords);
2640 
2641     // Fill QUADWORDs
2642     {
2643       UnsafeMemoryAccessMark umam(this, true, true);
2644 
2645       // At this point we know the low 3 bits of both dest and size are zero,
2646       // so size is a multiple of 8 and dest is 8-byte aligned
2647       do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
2648                                L_exit, _masm);
2649     }
2650     __ BIND(L_exit);
2651 
2652     __ leave();   // required for proper stackwalking of RuntimeStub frame
2653     __ ret(0);
2654 
2655     __ BIND(L_fillDwords);
2656 
2657     // Fill DWORDs
2658     {
2659       UnsafeMemoryAccessMark umam(this, true, true);
2660 
2661       // At this point we know the low 2 bits of both dest and size are zero,
2662       // so size is a multiple of 4 and dest is 4-byte aligned
2663       do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
2664                                L_exit, _masm);
2665     }
2666     __ jmpb(L_exit);
2667 
2668     __ BIND(L_fillBytes);
2669     // Set up for tail call to previously generated byte fill routine
2670     // Parameter order is (ptr, byteVal, size)
2671     __ xchgq(c_rarg1, c_rarg2);
2672     __ leave();    // Clear effect of enter()
2673     __ jump(RuntimeAddress(unsafe_byte_fill));
2674   }
2675 
2676   return start;
2677 }
2678 
2679 // Perform range checks on the proposed arraycopy.
2680 // Kills temp, but nothing else.
2681 // Also, clean the sign bits of src_pos and dst_pos.
2682 void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2683                                            Register src_pos, // source position (c_rarg1)
2684                                            Register dst,     // destination array oop (c_rarg2)
2685                                            Register dst_pos, // destination position (c_rarg3)
2686                                            Register length,
2687                                            Register temp,
2688                                            Label& L_failed) {
2689   BLOCK_COMMENT("arraycopy_range_checks:");
2690 
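       // Note: the compares below are unsigned ('above'), so an index sum that
       // overflows into the sign bit is also (correctly) rejected.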
2691   //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2692   __ movl(temp, length);
2693   __ addl(temp, src_pos);             // src_pos + length
2694   __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2695   __ jcc(Assembler::above, L_failed);
2696 
2697   //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2698   __ movl(temp, length);
2699   __ addl(temp, dst_pos);             // dst_pos + length
2700   __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2701   __ jcc(Assembler::above, L_failed);
2702 
2703   // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2704   // Move with sign extension can be used since they are positive.
2705   __ movslq(src_pos, src_pos);
2706   __ movslq(dst_pos, dst_pos);
2707 
2708   BLOCK_COMMENT("arraycopy_range_checks done");
2709 }
2710 
2711 
2712 //  Generate generic array copy stubs
2713 //
2714 //  Input:
2715 //    c_rarg0    -  src oop
2716 //    c_rarg1    -  src_pos (32-bits)
2717 //    c_rarg2    -  dst oop
2718 //    c_rarg3    -  dst_pos (32-bits)
2719 // not Win64
2720 //    c_rarg4    -  element count (32-bits)
2721 // Win64
2722 //    rsp+40     -  element count (32-bits)
2723 //
2724 //  Output:
2725 //    rax ==  0  -  success
2726 //    rax == -1^K - failure, where K is partial transfer count
2727 //
2728 address StubGenerator::generate_generic_copy(const char *name,
2729                                              address byte_copy_entry, address short_copy_entry,
2730                                              address int_copy_entry, address oop_copy_entry,
2731                                              address long_copy_entry, address checkcast_copy_entry) {
2732 
2733   Label L_failed, L_failed_0, L_objArray;
2734   Label L_copy_shorts, L_copy_ints, L_copy_longs;
2735 
2736   // Input registers
2737   const Register src        = c_rarg0;  // source array oop
2738   const Register src_pos    = c_rarg1;  // source position
2739   const Register dst        = c_rarg2;  // destination array oop
2740   const Register dst_pos    = c_rarg3;  // destination position
2741 #ifndef _WIN64
2742   const Register length     = c_rarg4;
2743   const Register rklass_tmp = r9;  // load_klass
2744 #else
2745   const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
2746   const Register rklass_tmp = rdi;  // load_klass
2747 #endif
2748 
2749   { int modulus = CodeEntryAlignment;
2750     int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2751     int advance = target - (__ offset() % modulus);
2752     if (advance < 0)  advance += modulus;
2753     if (advance > 0)  __ nop(advance);
2754   }
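       // The nops above pad the code so that the 5-byte jmp emitted at
       // L_failed_0 below ends exactly on a CodeEntryAlignment boundary
       // (verified by the assert after it), leaving 'start' aligned.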
2755   StubCodeMark mark(this, "StubRoutines", name);
2756 
2757   // Short-hop target to L_failed.  Makes for denser prologue code.
2758   __ BIND(L_failed_0);
2759   __ jmp(L_failed);
2760   assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2761 
2762   __ align(CodeEntryAlignment);
2763   address start = __ pc();
2764 
2765   __ enter(); // required for proper stackwalking of RuntimeStub frame
2766 
2767 #ifdef _WIN64
2768   __ push(rklass_tmp); // rdi is callee-save on Windows
2769 #endif
2770 
2771   // bump this on entry, not on exit:
2772   INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);
2773 
2774   //-----------------------------------------------------------------------
2775   // Assembler stub will be used for this call to arraycopy
2776   // if the following conditions are met:
2777   //
2778   // (1) src and dst must not be null.
2779   // (2) src_pos must not be negative.
2780   // (3) dst_pos must not be negative.
2781   // (4) length  must not be negative.
2782   // (5) src klass and dst klass should be the same and not null.
2783   // (6) src and dst should be arrays.
2784   // (7) src_pos + length must not exceed length of src.
2785   // (8) dst_pos + length must not exceed length of dst.
2786   //
2787 
2788   //  if (src == nullptr) return -1;
2789   __ testptr(src, src);         // src oop
2790   size_t j1off = __ offset();
2791   __ jccb(Assembler::zero, L_failed_0);
2792 
2793   //  if (src_pos < 0) return -1;
2794   __ testl(src_pos, src_pos); // src_pos (32-bits)
2795   __ jccb(Assembler::negative, L_failed_0);
2796 
2797   //  if (dst == nullptr) return -1;
2798   __ testptr(dst, dst);         // dst oop
2799   __ jccb(Assembler::zero, L_failed_0);
2800 
2801   //  if (dst_pos < 0) return -1;
2802   __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2803   size_t j4off = __ offset();
2804   __ jccb(Assembler::negative, L_failed_0);
2805 
2806   // The first four tests are very dense code,
2807   // but not quite dense enough to put four
2808   // jumps in a 16-byte instruction fetch buffer.
2809   // That's good, because some branch predictors
2810   // do not like jumps so close together.
2811   // Make sure of this.
2812   guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2813 
2814   // registers used as temp
2815   const Register r11_length    = r11; // elements count to copy
2816   const Register r10_src_klass = r10; // array klass
2817 
2818   //  if (length < 0) return -1;
2819   __ movl(r11_length, length);        // length (elements count, 32-bits value)
2820   __ testl(r11_length, r11_length);
2821   __ jccb(Assembler::negative, L_failed_0);
2822 
2823   __ load_klass(r10_src_klass, src, rklass_tmp);
2824 #ifdef ASSERT
2825   //  assert(src->klass() != nullptr);
2826   {
2827     BLOCK_COMMENT("assert klasses not null {");
2828     Label L1, L2;
2829     __ testptr(r10_src_klass, r10_src_klass);
2830     __ jcc(Assembler::notZero, L2);   // it is broken if klass is null
2831     __ bind(L1);
2832     __ stop("broken null klass");
2833     __ bind(L2);
2834     __ load_klass(rax, dst, rklass_tmp);
2835     __ cmpq(rax, 0);
2836     __ jcc(Assembler::equal, L1);     // this would be broken also
2837     BLOCK_COMMENT("} assert klasses not null done");
2838   }
2839 #endif
2840 
2841   // Load layout helper (32-bits)
2842   //
2843   //  |array_tag|     | header_size | element_type |     |log2_element_size|
2844   // 32        30    24            16              8     2                 0
2845   //
2846   //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2847   //
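       // Worked example: for an int[] the layout helper has array_tag 0x3,
       // element_type T_INT and log2_element_size 2, so the log2_element_size
       // field steers the dispatch below to the jint copy loop.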
2848 
2849   const int lh_offset = in_bytes(Klass::layout_helper_offset());
2850 
2851   // Handle objArrays completely differently...
2852   const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2853   __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2854   __ jcc(Assembler::equal, L_objArray);
2855 
2856   //  if (src->klass() != dst->klass()) return -1;
2857   __ load_klass(rax, dst, rklass_tmp);
2858   __ cmpq(r10_src_klass, rax);
2859   __ jcc(Assembler::notEqual, L_failed);
2860 
2861   const Register rax_lh = rax;  // layout helper
2862   __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2863 
2864   //  if (!src->is_Array()) return -1;
2865   __ cmpl(rax_lh, Klass::_lh_neutral_value);
2866   __ jcc(Assembler::greaterEqual, L_failed);
2867 
2868   // At this point, it is known to be a typeArray (array_tag 0x3).
2869 #ifdef ASSERT
2870   {
2871     BLOCK_COMMENT("assert primitive array {");
2872     Label L;
2873     __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2874     __ jcc(Assembler::greaterEqual, L);
2875     __ stop("must be a primitive array");
2876     __ bind(L);
2877     BLOCK_COMMENT("} assert primitive array done");
2878   }
2879 #endif
2880 
2881   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2882                          r10, L_failed);
2883 
2884   // TypeArrayKlass
2885   //
2886   // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2887   // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2888   //
2889 
2890   const Register r10_offset = r10;    // array offset
2891   const Register rax_elsize = rax_lh; // element size
2892 
2893   __ movl(r10_offset, rax_lh);
2894   __ shrl(r10_offset, Klass::_lh_header_size_shift);
2895   __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2896   __ addptr(src, r10_offset);           // src array offset
2897   __ addptr(dst, r10_offset);           // dst array offset
2898   BLOCK_COMMENT("choose copy loop based on element size");
2899   __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
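       // rax_elsize now holds log2(element size): 0 -> byte, 1 -> short,
       // 2 -> int, 3 -> long; the compares below pick the matching copy stub
       // and the lea instructions scale src_pos/dst_pos by the same factor.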
2900 
2901 #ifdef _WIN64
2902   __ pop(rklass_tmp); // Restore callee-save rdi
2903 #endif
2904 
2905   // next registers should be set before the jump to corresponding stub
2906   const Register from     = c_rarg0;  // source array address
2907   const Register to       = c_rarg1;  // destination array address
2908   const Register count    = c_rarg2;  // elements count
2909 
2910   // The 'from', 'to' and 'count' registers must be set in this order
2911   // since they alias 'src', 'src_pos' and 'dst' respectively.
2912 
2913   __ cmpl(rax_elsize, 0);
2914   __ jccb(Assembler::notEqual, L_copy_shorts);
2915   __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2916   __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2917   __ movl2ptr(count, r11_length); // length
2918   __ jump(RuntimeAddress(byte_copy_entry));
2919 
2920 __ BIND(L_copy_shorts);
2921   __ cmpl(rax_elsize, LogBytesPerShort);
2922   __ jccb(Assembler::notEqual, L_copy_ints);
2923   __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2924   __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2925   __ movl2ptr(count, r11_length); // length
2926   __ jump(RuntimeAddress(short_copy_entry));
2927 
2928 __ BIND(L_copy_ints);
2929   __ cmpl(rax_elsize, LogBytesPerInt);
2930   __ jccb(Assembler::notEqual, L_copy_longs);
2931   __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2932   __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2933   __ movl2ptr(count, r11_length); // length
2934   __ jump(RuntimeAddress(int_copy_entry));
2935 
2936 __ BIND(L_copy_longs);
2937 #ifdef ASSERT
2938   {
2939     BLOCK_COMMENT("assert long copy {");
2940     Label L;
2941     __ cmpl(rax_elsize, LogBytesPerLong);
2942     __ jcc(Assembler::equal, L);
2943     __ stop("must be long copy, but elsize is wrong");
2944     __ bind(L);
2945     BLOCK_COMMENT("} assert long copy done");
2946   }
2947 #endif
2948   __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2949   __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2950   __ movl2ptr(count, r11_length); // length
2951   __ jump(RuntimeAddress(long_copy_entry));
2952 
2953   // ObjArrayKlass
2954 __ BIND(L_objArray);
2955   // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
2956 
2957   Label L_plain_copy, L_checkcast_copy;
2958   //  test array classes for subtyping
2959   __ load_klass(rax, dst, rklass_tmp);
2960   __ cmpq(r10_src_klass, rax); // usual case is exact equality
2961   __ jcc(Assembler::notEqual, L_checkcast_copy);
2962 
2963   // Identically typed arrays can be copied without element-wise checks.
2964   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2965                          r10, L_failed);
2966 
2967   __ lea(from, Address(src, src_pos, TIMES_OOP,
2968                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2969   __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2970                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2971   __ movl2ptr(count, r11_length); // length
2972 __ BIND(L_plain_copy);
2973 #ifdef _WIN64
2974   __ pop(rklass_tmp); // Restore callee-save rdi
2975 #endif
2976   __ jump(RuntimeAddress(oop_copy_entry));
2977 
2978 __ BIND(L_checkcast_copy);
2979   // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
2980   {
2981     // Before looking at dst.length, make sure dst is also an objArray.
2982     __ cmpl(Address(rax, lh_offset), objArray_lh);
2983     __ jcc(Assembler::notEqual, L_failed);
2984 
2985     // It is safe to examine both src.length and dst.length.
2986     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2987                            rax, L_failed);
2988 
2989     const Register r11_dst_klass = r11;
2990     __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
2991 
2992     // Marshal the base address arguments now, freeing registers.
2993     __ lea(from, Address(src, src_pos, TIMES_OOP,
2994                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2995     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2996                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2997     __ movl(count, length);           // length (reloaded)
2998     Register sco_temp = c_rarg3;      // this register is free now
2999     assert_different_registers(from, to, count, sco_temp,
3000                                r11_dst_klass, r10_src_klass);
3001     assert_clean_int(count, sco_temp);
3002 
3003     // Generate the type check.
3004     const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3005     __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3006     assert_clean_int(sco_temp, rax);
3007     generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
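         // If the src array klass is a subtype of the dst array klass, every
         // element is assignable and the check above jumps to L_plain_copy;
         // otherwise fall through and hand each element to the checkcast stub.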
3008 
3009     // Fetch destination element klass from the ObjArrayKlass header.
3010     int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3011     __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3012     __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3013     assert_clean_int(sco_temp, rax);
3014 
3015 #ifdef _WIN64
3016     __ pop(rklass_tmp); // Restore callee-save rdi
3017 #endif
3018 
3019     // the checkcast_copy loop needs two extra arguments:
3020     assert(c_rarg3 == sco_temp, "#3 already in place");
3021     // Set up arguments for checkcast_copy_entry.
3022     setup_arg_regs_using_thread(4);
3023     __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3024     __ jump(RuntimeAddress(checkcast_copy_entry));
3025   }
3026 
3027 __ BIND(L_failed);
3028 #ifdef _WIN64
3029   __ pop(rklass_tmp); // Restore callee-save rdi
3030 #endif
3031   __ xorptr(rax, rax);
3032   __ notptr(rax); // return -1
3033   __ leave();   // required for proper stackwalking of RuntimeStub frame
3034   __ ret(0);
3035 
3036   return start;
3037 }
3038 
3039 #undef __