1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/macroAssembler.hpp"
  26 #include "gc/shared/barrierSet.hpp"
  27 #include "gc/shared/barrierSetAssembler.hpp"
  28 #include "oops/objArrayKlass.hpp"
  29 #include "runtime/sharedRuntime.hpp"
  30 #include "runtime/stubRoutines.hpp"
  31 #include "stubGenerator_x86_64.hpp"
  32 #ifdef COMPILER2
  33 #include "opto/c2_globals.hpp"
  34 #endif
  35 #if INCLUDE_JVMCI
  36 #include "jvmci/jvmci_globals.hpp"
  37 #endif
  38 
  39 #define __ _masm->
  40 
  41 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #else
  46 #define BLOCK_COMMENT(str) __ block_comment(str)
  47 #endif // PRODUCT
  48 
  49 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  50 
  51 #ifdef PRODUCT
  52 #define INC_COUNTER_NP(counter, rscratch) ((void)0)
  53 #else
  54 #define INC_COUNTER_NP(counter, rscratch) \
  55 BLOCK_COMMENT("inc_counter " #counter); \
  56 inc_counter_np(_masm, counter, rscratch);
  57 
  58 static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  59   __ incrementl(ExternalAddress((address)&counter), rscratch);
  60 }
  61 
  62 #if COMPILER2_OR_JVMCI
  63 static uint& get_profile_ctr(int shift) {
  64   if (shift == 0) {
  65     return SharedRuntime::_jbyte_array_copy_ctr;
  66   } else if (shift == 1) {
  67     return SharedRuntime::_jshort_array_copy_ctr;
  68   } else if (shift == 2) {
  69     return SharedRuntime::_jint_array_copy_ctr;
  70   } else {
  71     assert(shift == 3, "");
  72     return SharedRuntime::_jlong_array_copy_ctr;
  73   }
  74 }
  75 #endif // COMPILER2_OR_JVMCI
  76 #endif // !PRODUCT
  77 
  78 void StubGenerator::generate_arraycopy_stubs() {
  79   address entry;
  80   address entry_jbyte_arraycopy;
  81   address entry_jshort_arraycopy;
  82   address entry_jint_arraycopy;
  83   address entry_oop_arraycopy;
  84   address entry_jlong_arraycopy;
  85   address entry_checkcast_arraycopy;
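       // The 'entry' addresses captured below are reused as dispatch targets by
       // the unsafe and generic arraycopy stubs generated further down.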
  86 
  87   StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
  88                                                                          "jbyte_disjoint_arraycopy");
  89   StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
  90                                                                          "jbyte_arraycopy");
  91 
  92   StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
  93                                                                           "jshort_disjoint_arraycopy");
  94   StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
  95                                                                           "jshort_arraycopy");
  96 
  97   StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
  98                                                                             "jint_disjoint_arraycopy");
  99   StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
 100                                                                             &entry_jint_arraycopy, "jint_arraycopy");
 101 
 102   StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
 103                                                                              "jlong_disjoint_arraycopy");
 104   StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
 105                                                                              &entry_jlong_arraycopy, "jlong_arraycopy");
 106   if (UseCompressedOops) {
 107     StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
 108                                                                             "oop_disjoint_arraycopy");
 109     StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
 110                                                                             &entry_oop_arraycopy, "oop_arraycopy");
 111     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
 112                                                                                    "oop_disjoint_arraycopy_uninit",
 113                                                                                    /*dest_uninitialized*/true);
 114     StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
 115                                                                                    nullptr, "oop_arraycopy_uninit",
 116                                                                                    /*dest_uninitialized*/true);
 117   } else {
 118     StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
 119                                                                              "oop_disjoint_arraycopy");
 120     StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
 121                                                                              &entry_oop_arraycopy, "oop_arraycopy");
 122     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
 123                                                                                     "oop_disjoint_arraycopy_uninit",
 124                                                                                     /*dest_uninitialized*/true);
 125     StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
 126                                                                                     nullptr, "oop_arraycopy_uninit",
 127                                                                                     /*dest_uninitialized*/true);
 128   }
 129 
 130   StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
 131   StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
 132                                                                       /*dest_uninitialized*/true);
 133 
 134   StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
 135                                                             entry_jbyte_arraycopy,
 136                                                             entry_jshort_arraycopy,
 137                                                             entry_jint_arraycopy,
 138                                                             entry_jlong_arraycopy);
 139   StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
 140                                                              entry_jbyte_arraycopy,
 141                                                              entry_jshort_arraycopy,
 142                                                              entry_jint_arraycopy,
 143                                                              entry_oop_arraycopy,
 144                                                              entry_jlong_arraycopy,
 145                                                              entry_checkcast_arraycopy);
 146 
 147   StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
 148   StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
 149   StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
 150   StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
 151   StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
 152   StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
 153 
 154   StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory("unsafe_setmemory", StubRoutines::_jbyte_fill);
 155 
 156   // We don't generate specialized code for HeapWord-aligned source
 157   // arrays, so just use the code we've already generated
 158   StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
 159   StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
 160 
 161   StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
 162   StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
 163 
 164   StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
 165   StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
 166 
 167   StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
 168   StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
 169 
 170   StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
 171   StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
 172 
 173   StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
 174   StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
 175 }
 176 
 177 
 178 // Verify that a register contains a clean 32-bit positive value
 179 // (high 32 bits are 0) so it can be used in 64-bit shifts.
 180 //
 181 //  Input:
 182 //    Rint  -  32-bit value
 183 //    Rtmp  -  scratch
 184 //
 185 void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
 186 #ifdef ASSERT
 187   Label L;
 188   assert_different_registers(Rtmp, Rint);
 189   __ movslq(Rtmp, Rint);
 190   __ cmpq(Rtmp, Rint);
 191   __ jcc(Assembler::equal, L);
 192   __ stop("high 32-bits of int value are not 0");
 193   __ bind(L);
 194 #endif
 195 }
 196 
 197 
 198 //  Generate overlap test for array copy stubs
 199 //
 200 //  Input:
 201 //     c_rarg0 - from
 202 //     c_rarg1 - to
 203 //     c_rarg2 - element count
 204 //
 205 //  Output:
 206 //     rax   - &from[element count], i.e. one past the last source element
 207 //
 208 void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
 209   const Register from     = c_rarg0;
 210   const Register to       = c_rarg1;
 211   const Register count    = c_rarg2;
 212   const Register end_from = rax;
 213 
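       // The forward (disjoint) copy is safe when the destination does not start
       // strictly inside the source range, i.e. when to <= from or
       // to >= from + count * element_size; otherwise fall through to the
       // conjoint (backward) copy code.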
 214   __ cmpptr(to, from);
 215   __ lea(end_from, Address(from, count, sf, 0));
 216   if (NOLp == nullptr) {
 217     RuntimeAddress no_overlap(no_overlap_target);
 218     __ jump_cc(Assembler::belowEqual, no_overlap);
 219     __ cmpptr(to, end_from);
 220     __ jump_cc(Assembler::aboveEqual, no_overlap);
 221   } else {
 222     __ jcc(Assembler::belowEqual, (*NOLp));
 223     __ cmpptr(to, end_from);
 224     __ jcc(Assembler::aboveEqual, (*NOLp));
 225   }
 226 }
 227 
 228 
 229 // Copy big chunks forward
 230 //
 231 // Inputs:
 232 //   end_from     - source array end address
 233 //   end_to       - destination array end address
 234 //   qword_count  - 64-bit element count, negative
 235 //   tmp1, tmp2   - scratch
 236 //   L_copy_bytes - entry label
 237 //   L_copy_8_bytes - exit label
 238 //
 239 void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
 240                                        Register qword_count, Register tmp1,
 241                                        Register tmp2, Label& L_copy_bytes,
 242                                        Label& L_copy_8_bytes, DecoratorSet decorators,
 243                                        BasicType type) {
 244   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 245   DEBUG_ONLY(__ stop("enter at entry label, not here"));
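       // qword_count is negative on entry and is incremented toward zero, so the
       // addresses below are formed relative to the end pointers and the copy
       // proceeds from lower to higher addresses.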
 246   Label L_loop;
 247   __ align(OptoLoopAlignment);
 248   if (UseUnalignedLoadStores) {
 249     Label L_end;
 250     __ BIND(L_loop);
 251     if (UseAVX >= 2) {
 252       bs->copy_load_at(_masm, decorators, type, 32,
 253                        xmm0, Address(end_from, qword_count, Address::times_8, -56),
 254                        tmp1, xmm1);
 255       bs->copy_store_at(_masm, decorators, type, 32,
 256                         Address(end_to, qword_count, Address::times_8, -56), xmm0,
 257                         tmp1, tmp2, xmm1);
 258 
 259       bs->copy_load_at(_masm, decorators, type, 32,
 260                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 261                        tmp1, xmm1);
 262       bs->copy_store_at(_masm, decorators, type, 32,
 263                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 264                         tmp1, tmp2, xmm1);
 265     } else {
 266       bs->copy_load_at(_masm, decorators, type, 16,
 267                        xmm0, Address(end_from, qword_count, Address::times_8, -56),
 268                        tmp1, xmm1);
 269       bs->copy_store_at(_masm, decorators, type, 16,
 270                         Address(end_to, qword_count, Address::times_8, -56), xmm0,
 271                         tmp1, tmp2, xmm1);
 272       bs->copy_load_at(_masm, decorators, type, 16,
 273                        xmm0, Address(end_from, qword_count, Address::times_8, -40),
 274                        tmp1, xmm1);
 275       bs->copy_store_at(_masm, decorators, type, 16,
 276                         Address(end_to, qword_count, Address::times_8, -40), xmm0,
 277                         tmp1, tmp2, xmm1);
 278       bs->copy_load_at(_masm, decorators, type, 16,
 279                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 280                        tmp1, xmm1);
 281       bs->copy_store_at(_masm, decorators, type, 16,
 282                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 283                         tmp1, tmp2, xmm1);
 284       bs->copy_load_at(_masm, decorators, type, 16,
 285                        xmm0, Address(end_from, qword_count, Address::times_8, -8),
 286                        tmp1, xmm1);
 287       bs->copy_store_at(_masm, decorators, type, 16,
 288                         Address(end_to, qword_count, Address::times_8, -8), xmm0,
 289                         tmp1, tmp2, xmm1);
 290     }
 291 
 292     __ BIND(L_copy_bytes);
 293     __ addptr(qword_count, 8);
 294     __ jcc(Assembler::lessEqual, L_loop);
 295     __ subptr(qword_count, 4);  // sub(8) and add(4)
 296     __ jcc(Assembler::greater, L_end);
 297     // Copy trailing 32 bytes
 298     if (UseAVX >= 2) {
 299       bs->copy_load_at(_masm, decorators, type, 32,
 300                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 301                        tmp1, xmm1);
 302       bs->copy_store_at(_masm, decorators, type, 32,
 303                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 304                         tmp1, tmp2, xmm1);
 305     } else {
 306       bs->copy_load_at(_masm, decorators, type, 16,
 307                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 308                        tmp1, xmm1);
 309       bs->copy_store_at(_masm, decorators, type, 16,
 310                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 311                         tmp1, tmp2, xmm1);
 312       bs->copy_load_at(_masm, decorators, type, 16,
 313                        xmm0, Address(end_from, qword_count, Address::times_8, -8),
 314                        tmp1, xmm1);
 315       bs->copy_store_at(_masm, decorators, type, 16,
 316                         Address(end_to, qword_count, Address::times_8, -8), xmm0,
 317                         tmp1, tmp2, xmm1);
 318     }
 319     __ addptr(qword_count, 4);
 320     __ BIND(L_end);
 321   } else {
 322     // Copy 32-bytes per iteration
 323     __ BIND(L_loop);
 324     bs->copy_load_at(_masm, decorators, type, 8,
 325                      tmp1, Address(end_from, qword_count, Address::times_8, -24),
 326                      tmp2);
 327     bs->copy_store_at(_masm, decorators, type, 8,
 328                       Address(end_to, qword_count, Address::times_8, -24), tmp1,
 329                       tmp2);
 330     bs->copy_load_at(_masm, decorators, type, 8,
 331                      tmp1, Address(end_from, qword_count, Address::times_8, -16),
 332                      tmp2);
 333     bs->copy_store_at(_masm, decorators, type, 8,
 334                       Address(end_to, qword_count, Address::times_8, -16), tmp1,
 335                       tmp2);
 336     bs->copy_load_at(_masm, decorators, type, 8,
 337                      tmp1, Address(end_from, qword_count, Address::times_8, -8),
 338                      tmp2);
 339     bs->copy_store_at(_masm, decorators, type, 8,
 340                       Address(end_to, qword_count, Address::times_8, -8), tmp1,
 341                       tmp2);
 342     bs->copy_load_at(_masm, decorators, type, 8,
 343                      tmp1, Address(end_from, qword_count, Address::times_8, 0),
 344                      tmp2);
 345     bs->copy_store_at(_masm, decorators, type, 8,
 346                       Address(end_to, qword_count, Address::times_8, 0), tmp1,
 347                       tmp2);
 348 
 349     __ BIND(L_copy_bytes);
 350     __ addptr(qword_count, 4);
 351     __ jcc(Assembler::lessEqual, L_loop);
 352   }
 353   __ subptr(qword_count, 4);
 354   __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
 355 }
 356 
 357 
 358 // Copy big chunks backward
 359 //
 360 // Inputs:
 361 //   from         - source array address
 362 //   dest         - destination array address
 363 //   qword_count  - 64-bit element count
 364 //   tmp1, tmp2   - scratch
 365 //   L_copy_bytes - entry label
 366 //   L_copy_8_bytes - exit label
 367 //
 368 void StubGenerator::copy_bytes_backward(Register from, Register dest,
 369                                         Register qword_count, Register tmp1,
 370                                         Register tmp2, Label& L_copy_bytes,
 371                                         Label& L_copy_8_bytes, DecoratorSet decorators,
 372                                         BasicType type) {
 373   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 374   DEBUG_ONLY(__ stop("enter at entry label, not here"));
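       // qword_count is positive on entry and is decremented toward zero, so the
       // copy proceeds from higher to lower addresses, as the conjoint
       // (overlapping) copy stubs require.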
 375   Label L_loop;
 376   __ align(OptoLoopAlignment);
 377   if (UseUnalignedLoadStores) {
 378     Label L_end;
 379     __ BIND(L_loop);
 380     if (UseAVX >= 2) {
 381       bs->copy_load_at(_masm, decorators, type, 32,
 382                        xmm0, Address(from, qword_count, Address::times_8, 32),
 383                        tmp1, xmm1);
 384       bs->copy_store_at(_masm, decorators, type, 32,
 385                         Address(dest, qword_count, Address::times_8, 32), xmm0,
 386                         tmp1, tmp2, xmm1);
 387       bs->copy_load_at(_masm, decorators, type, 32,
 388                        xmm0, Address(from, qword_count, Address::times_8, 0),
 389                        tmp1, xmm1);
 390       bs->copy_store_at(_masm, decorators, type, 32,
 391                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 392                         tmp1, tmp2, xmm1);
 393     } else {
 394       bs->copy_load_at(_masm, decorators, type, 16,
 395                        xmm0, Address(from, qword_count, Address::times_8, 48),
 396                        tmp1, xmm1);
 397       bs->copy_store_at(_masm, decorators, type, 16,
 398                         Address(dest, qword_count, Address::times_8, 48), xmm0,
 399                         tmp1, tmp2, xmm1);
 400       bs->copy_load_at(_masm, decorators, type, 16,
 401                        xmm0, Address(from, qword_count, Address::times_8, 32),
 402                        tmp1, xmm1);
 403       bs->copy_store_at(_masm, decorators, type, 16,
 404                         Address(dest, qword_count, Address::times_8, 32), xmm0,
 405                         tmp1, tmp2, xmm1);
 406       bs->copy_load_at(_masm, decorators, type, 16,
 407                        xmm0, Address(from, qword_count, Address::times_8, 16),
 408                        tmp1, xmm1);
 409       bs->copy_store_at(_masm, decorators, type, 16,
 410                         Address(dest, qword_count, Address::times_8, 16), xmm0,
 411                         tmp1, tmp2, xmm1);
 412       bs->copy_load_at(_masm, decorators, type, 16,
 413                        xmm0, Address(from, qword_count, Address::times_8, 0),
 414                        tmp1, xmm1);
 415       bs->copy_store_at(_masm, decorators, type, 16,
 416                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 417                         tmp1, tmp2, xmm1);
 418     }
 419 
 420     __ BIND(L_copy_bytes);
 421     __ subptr(qword_count, 8);
 422     __ jcc(Assembler::greaterEqual, L_loop);
 423 
 424     __ addptr(qword_count, 4);  // add(8) and sub(4)
 425     __ jcc(Assembler::less, L_end);
 426     // Copy trailing 32 bytes
 427     if (UseAVX >= 2) {
 428       bs->copy_load_at(_masm, decorators, type, 32,
 429                        xmm0, Address(from, qword_count, Address::times_8, 0),
 430                        tmp1, xmm1);
 431       bs->copy_store_at(_masm, decorators, type, 32,
 432                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 433                         tmp1, tmp2, xmm1);
 434     } else {
 435       bs->copy_load_at(_masm, decorators, type, 16,
 436                        xmm0, Address(from, qword_count, Address::times_8, 16),
 437                        tmp1, xmm1);
 438       bs->copy_store_at(_masm, decorators, type, 16,
 439                         Address(dest, qword_count, Address::times_8, 16), xmm0,
 440                         tmp1, tmp2, xmm1);
 441       bs->copy_load_at(_masm, decorators, type, 16,
 442                        xmm0, Address(from, qword_count, Address::times_8, 0),
 443                        tmp1, xmm1);
 444       bs->copy_store_at(_masm, decorators, type, 16,
 445                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 446                         tmp1, tmp2, xmm1);
 447     }
 448     __ subptr(qword_count, 4);
 449     __ BIND(L_end);
 450   } else {
 451     // Copy 32-bytes per iteration
 452     __ BIND(L_loop);
 453     bs->copy_load_at(_masm, decorators, type, 8,
 454                      tmp1, Address(from, qword_count, Address::times_8, 24),
 455                      tmp2);
 456     bs->copy_store_at(_masm, decorators, type, 8,
 457                       Address(dest, qword_count, Address::times_8, 24), tmp1,
 458                       tmp2);
 459     bs->copy_load_at(_masm, decorators, type, 8,
 460                      tmp1, Address(from, qword_count, Address::times_8, 16),
 461                      tmp2);
 462     bs->copy_store_at(_masm, decorators, type, 8,
 463                       Address(dest, qword_count, Address::times_8, 16), tmp1,
 464                       tmp2);
 465     bs->copy_load_at(_masm, decorators, type, 8,
 466                      tmp1, Address(from, qword_count, Address::times_8, 8),
 467                      tmp2);
 468     bs->copy_store_at(_masm, decorators, type, 8,
 469                       Address(dest, qword_count, Address::times_8, 8), tmp1,
 470                       tmp2);
 471     bs->copy_load_at(_masm, decorators, type, 8,
 472                      tmp1, Address(from, qword_count, Address::times_8, 0),
 473                      tmp2);
 474     bs->copy_store_at(_masm, decorators, type, 8,
 475                       Address(dest, qword_count, Address::times_8, 0), tmp1,
 476                       tmp2);
 477 
 478     __ BIND(L_copy_bytes);
 479     __ subptr(qword_count, 4);
 480     __ jcc(Assembler::greaterEqual, L_loop);
 481   }
 482   __ addptr(qword_count, 4);
 483   __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
 484 }
 485 
 486 #if COMPILER2_OR_JVMCI
 487 
 488 // Note: The following rules apply to the AVX3 optimized arraycopy stubs:
 489 // - If the target supports the AVX3 features (BW+VL+F), the implementation uses 32 byte vectors (YMMs)
 490 //   for both the special cases (various small block sizes) and the aligned copy loop. This is the
 491 //   default configuration.
 492 // - If the copy length is above AVX3Threshold, the implementation uses 64 byte vectors (ZMMs)
 493 //   for the main copy loop (and the subsequent tail), since the bulk of the cycles is consumed there.
 494 // - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS shows
 495 //   better performance for disjoint copies. For conjoint/backward copies the vector based
 496 //   copy performs better.
 497 // - If the user sets AVX3Threshold=0, the special cases for small block sizes operate over
 498 //   64 byte vector registers (ZMMs).
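     //
     // Example (illustrative only): assuming VM_Version::avx3_threshold() returns
     // the default AVX3Threshold of 4096 and MaxVectorSize is 64, a 1 KB byte copy
     // stays on the 32 byte (YMM) path, while a 16 KB byte copy takes the 64 byte
     // (ZMM) path selected via the threshold[] table in the generator below.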
 499 
 500 // Inputs:
 501 //   c_rarg0   - source array address
 502 //   c_rarg1   - destination array address
 503 //   c_rarg2   - element count, treated as ssize_t, can be zero
 504 //
 505 //
 506 // Side Effects:
 507 //   If entry is not null, *entry is set to the no-overlap entry point, which
 508 //   is used by generate_conjoint_[byte/int/short/long]_copy().
 509 //
 510 address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const char *name,
 511                                                           int shift, bool aligned, bool is_oop,
 512                                                           bool dest_uninitialized) {
 513   __ align(CodeEntryAlignment);
 514   StubCodeMark mark(this, "StubRoutines", name);
 515   address start = __ pc();
 516 
 517   int avx3threshold = VM_Version::avx3_threshold();
 518   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
 519   const int large_threshold = 2621440; // 2.5 MB
 520   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 521   Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 522   Label L_copy_large, L_finish;
 523   const Register from        = rdi;  // source array address
 524   const Register to          = rsi;  // destination array address
 525   const Register count       = rdx;  // elements count
 526   const Register temp1       = r8;
 527   const Register temp2       = r11;
 528   const Register temp3       = rax;
 529   const Register temp4       = rcx;
 530   // End pointers are inclusive, and if count is not zero they point
 531   // to the last unit copied:  end_to[0] := end_from[0]
 532 
 533   __ enter(); // required for proper stackwalking of RuntimeStub frame
 534   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 535 
 536   if (entry != nullptr) {
 537     *entry = __ pc();
 538      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 539     BLOCK_COMMENT("Entry:");
 540   }
 541 
 542   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 543   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 544 
 545   setup_argument_regs(type);
 546 
 547   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 548   if (dest_uninitialized) {
 549     decorators |= IS_DEST_UNINITIALIZED;
 550   }
 551   if (aligned) {
 552     decorators |= ARRAYCOPY_ALIGNED;
 553   }
 554   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 555   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 556 
 557   {
 558     // Type(shift)           byte(0), short(1), int(2),   long(3)
 559     int loop_size[]        = { 192,     96,       48,      24};
 560     int threshold[]        = { 4096,    2048,     1024,    512};
 561 
 562     // UnsafeMemoryAccess page error: continue after unsafe access
 563     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
 564     // 'from', 'to' and 'count' are now valid
 565 
 566     // temp1 holds remaining count and temp4 holds running count used to compute
 567     // next address offset for start of to/from addresses (temp4 * scale).
 568     __ mov64(temp4, 0);
 569     __ movq(temp1, count);
 570 
 571     // Zero length check.
 572     __ BIND(L_tail);
 573     __ cmpq(temp1, 0);
 574     __ jcc(Assembler::lessEqual, L_exit);
 575 
 576     // Special cases using 32 byte [masked] vector copy operations.
 577     arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 578                                  temp4, temp3, use64byteVector, L_entry, L_exit);
 579 
 580     // PRE-MAIN-POST loop for aligned copy.
 581     __ BIND(L_entry);
 582 
 583     if (MaxVectorSize == 64) {
 584       __ movq(temp2, temp1);
 585       __ shlq(temp2, shift);
 586       __ cmpq(temp2, large_threshold);
 587       __ jcc(Assembler::greaterEqual, L_copy_large);
 588     }
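         // temp2 above is the copy length in bytes; copies of at least
         // large_threshold (2.5 MB) are routed to the non-temporal large copy
         // loop generated at L_copy_large.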
 589     if (avx3threshold != 0) {
 590       __ cmpq(count, threshold[shift]);
 591       if (MaxVectorSize == 64) {
 592         // Copy using 64 byte vectors.
 593         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 594       } else {
 595         assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
 596         // REP MOVS offers a faster copy path.
 597         __ jcc(Assembler::greaterEqual, L_repmovs);
 598       }
 599     }
 600 
 601     if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
 602       // Partial copy to make dst address 32 byte aligned.
 603       __ movq(temp2, to);
 604       __ andq(temp2, 31);
 605       __ jcc(Assembler::equal, L_main_pre_loop);
 606 
 607       __ negptr(temp2);
 608       __ addq(temp2, 32);
 609       if (shift) {
 610         __ shrq(temp2, shift);
 611       }
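           // temp2 now holds the number of elements needed to bring the
           // destination address up to the next 32 byte boundary.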
 612       __ movq(temp3, temp2);
 613       copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
 614       __ movq(temp4, temp2);
 615       __ movq(temp1, count);
 616       __ subq(temp1, temp2);
 617 
 618       __ cmpq(temp1, loop_size[shift]);
 619       __ jcc(Assembler::less, L_tail);
 620 
 621       __ BIND(L_main_pre_loop);
 622       __ subq(temp1, loop_size[shift]);
 623 
 624       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 625       __ align32();
 626       __ BIND(L_main_loop);
 627          copy64_avx(to, from, temp4, xmm1, false, shift, 0);
 628          copy64_avx(to, from, temp4, xmm1, false, shift, 64);
 629          copy64_avx(to, from, temp4, xmm1, false, shift, 128);
 630          __ addptr(temp4, loop_size[shift]);
 631          __ subq(temp1, loop_size[shift]);
 632          __ jcc(Assembler::greater, L_main_loop);
 633 
 634       __ addq(temp1, loop_size[shift]);
 635 
 636       // Tail loop.
 637       __ jmp(L_tail);
 638 
 639       __ BIND(L_repmovs);
 640         __ movq(temp2, temp1);
 641         // Swap to(RSI) and from(RDI) addresses to comply with REP MOVS semantics.
 642         __ movq(temp3, to);
 643         __ movq(to,  from);
 644         __ movq(from, temp3);
 645         // Save to/from for restoration post rep_mov.
 646         __ movq(temp1, to);
 647         __ movq(temp3, from);
 648         if (shift < 3) {
 649           __ shrq(temp2, 3-shift);     // quad word count
 650         }
 651         __ movq(temp4, temp2);         // move quad word count into temp4(RCX).
 652         __ rep_mov();
 653         __ shlq(temp2, 3);             // convert quad words into byte count.
 654         if (shift) {
 655           __ shrq(temp2, shift);       // type specific count.
 656         }
 657         // Restore original addresses in to/from.
 658         __ movq(to, temp3);
 659         __ movq(from, temp1);
 660         __ movq(temp4, temp2);
 661         __ movq(temp1, count);
 662         __ subq(temp1, temp2);         // trailing part (less than a quad word size).
 663         __ jmp(L_tail);
 664     }
 665 
 666     if (MaxVectorSize > 32) {
 667       __ BIND(L_pre_main_post_64);
 668       // Partial copy to make dst address 64 byte aligned.
 669       __ movq(temp2, to);
 670       __ andq(temp2, 63);
 671       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 672 
 673       __ negptr(temp2);
 674       __ addq(temp2, 64);
 675       if (shift) {
 676         __ shrq(temp2, shift);
 677       }
 678       __ movq(temp3, temp2);
 679       copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
 680       __ movq(temp4, temp2);
 681       __ movq(temp1, count);
 682       __ subq(temp1, temp2);
 683 
 684       __ cmpq(temp1, loop_size[shift]);
 685       __ jcc(Assembler::less, L_tail64);
 686 
 687       __ BIND(L_main_pre_loop_64bytes);
 688       __ subq(temp1, loop_size[shift]);
 689 
 690       // Main loop with aligned copy block size of 192 bytes at
 691       // 64 byte copy granularity.
 692       __ align32();
 693       __ BIND(L_main_loop_64bytes);
 694          copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
 695          copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
 696          copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
 697          __ addptr(temp4, loop_size[shift]);
 698          __ subq(temp1, loop_size[shift]);
 699          __ jcc(Assembler::greater, L_main_loop_64bytes);
 700 
 701       __ addq(temp1, loop_size[shift]);
 702       // Zero length check.
 703       __ jcc(Assembler::lessEqual, L_exit);
 704 
 705       __ BIND(L_tail64);
 706 
 707       // Tail handling using 64 byte [masked] vector copy operations.
 708       use64byteVector = true;
 709       arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 710                                    temp4, temp3, use64byteVector, L_entry, L_exit);
 711     }
 712     __ BIND(L_exit);
 713   }
 714 
 715   __ BIND(L_finish);
 716   address ucme_exit_pc = __ pc();
 717   // When called from generic_arraycopy, r11 contains specific values
 718   // used during the arraycopy epilogue; re-initialize r11 here.
 719   if (is_oop) {
 720     __ movq(r11, shift == 3 ? count : to);
 721   }
 722   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
 723   restore_argument_regs(type);
 724   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
 725   __ xorptr(rax, rax); // return 0
 726   __ vzeroupper();
 727   __ leave(); // required for proper stackwalking of RuntimeStub frame
 728   __ ret(0);
 729 
 730   if (MaxVectorSize == 64) {
 731     __ BIND(L_copy_large);
 732       UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
 733       arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
 734     __ jmp(L_finish);
 735   }
 736   return start;
 737 }
 738 
 739 void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
 740                                          Register temp3, Register temp4, Register count,
 741                                          XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
 742                                          XMMRegister xmm4, int shift) {
 743 
 744   // Type(shift)           byte(0), short(1), int(2),   long(3)
 745   int loop_size[]        = { 256,     128,       64,      32};
 746   int threshold[]        = { 4096,    2048,     1024,    512};
 747 
 748   Label L_main_loop_large;
 749   Label L_tail_large;
 750   Label L_exit_large;
 751   Label L_entry_large;
 752   Label L_main_pre_loop_large;
 753   Label L_pre_main_post_large;
 754 
 755   assert(MaxVectorSize == 64, "vector length != 64");
 756   __ BIND(L_entry_large);
 757 
 758   __ BIND(L_pre_main_post_large);
 759   // Partial copy to make dst address 64 byte aligned.
 760   __ movq(temp2, to);
 761   __ andq(temp2, 63);
 762   __ jcc(Assembler::equal, L_main_pre_loop_large);
 763 
 764   __ negptr(temp2);
 765   __ addq(temp2, 64);
 766   if (shift) {
 767     __ shrq(temp2, shift);
 768   }
 769   __ movq(temp3, temp2);
 770   copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
 771   __ movq(temp4, temp2);
 772   __ movq(temp1, count);
 773   __ subq(temp1, temp2);
 774 
 775   __ cmpq(temp1, loop_size[shift]);
 776   __ jcc(Assembler::less, L_tail_large);
 777 
 778   __ BIND(L_main_pre_loop_large);
 779   __ subq(temp1, loop_size[shift]);
 780 
 781   // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
 782   __ align32();
 783   __ BIND(L_main_loop_large);
 784   copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
 785   __ addptr(temp4, loop_size[shift]);
 786   __ subq(temp1, loop_size[shift]);
 787   __ jcc(Assembler::greater, L_main_loop_large);
 788   // fence needed because copy256_avx3 uses non-temporal stores
 789   __ sfence();
 790 
 791   __ addq(temp1, loop_size[shift]);
 792   // Zero length check.
 793   __ jcc(Assembler::lessEqual, L_exit_large);
 794   __ BIND(L_tail_large);
 795   // Tail handling using 64 byte [masked] vector copy operations.
 796   __ cmpq(temp1, 0);
 797   __ jcc(Assembler::lessEqual, L_exit_large);
 798   arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
 799                                    temp4, temp3, L_exit_large);
 800   __ BIND(L_exit_large);
 801 }
 802 
 803 // Inputs:
 804 //   c_rarg0   - source array address
 805 //   c_rarg1   - destination array address
 806 //   c_rarg2   - element count, treated as ssize_t, can be zero
 807 //
 808 //
 809 address StubGenerator::generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
 810                                                           address nooverlap_target, bool aligned,
 811                                                           bool is_oop, bool dest_uninitialized) {
 812   __ align(CodeEntryAlignment);
 813   StubCodeMark mark(this, "StubRoutines", name);
 814   address start = __ pc();
 815 
 816   int avx3threshold = VM_Version::avx3_threshold();
 817   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
 818 
 819   Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 820   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 821   const Register from        = rdi;  // source array address
 822   const Register to          = rsi;  // destination array address
 823   const Register count       = rdx;  // elements count
 824   const Register temp1       = r8;
 825   const Register temp2       = rcx;
 826   const Register temp3       = r11;
 827   const Register temp4       = rax;
 828   // End pointers are inclusive, and if count is not zero they point
 829   // to the last unit copied:  end_to[0] := end_from[0]
 830 
 831   __ enter(); // required for proper stackwalking of RuntimeStub frame
 832   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 833 
 834   if (entry != nullptr) {
 835     *entry = __ pc();
 836      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 837     BLOCK_COMMENT("Entry:");
 838   }
 839 
 840   array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
 841 
 842   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 843   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 844 
 845   setup_argument_regs(type);
 846 
 847   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 848   if (dest_uninitialized) {
 849     decorators |= IS_DEST_UNINITIALIZED;
 850   }
 851   if (aligned) {
 852     decorators |= ARRAYCOPY_ALIGNED;
 853   }
 854   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 855   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 856   {
 857     // Type(shift)       byte(0), short(1), int(2),   long(3)
 858     int loop_size[]   = { 192,     96,       48,      24};
 859     int threshold[]   = { 4096,    2048,     1024,    512};
 860 
 861     // UnsafeMemoryAccess page error: continue after unsafe access
 862     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
 863     // 'from', 'to' and 'count' are now valid
 864 
 865     // temp1 holds remaining count.
 866     __ movq(temp1, count);
 867 
 868     // Zero length check.
 869     __ BIND(L_tail);
 870     __ cmpq(temp1, 0);
 871     __ jcc(Assembler::lessEqual, L_exit);
 872 
 873     __ mov64(temp2, 0);
 874     __ movq(temp3, temp1);
 875     // Special cases using 32 byte [masked] vector copy operations.
 876     arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
 877                                           temp4, use64byteVector, L_entry, L_exit);
 878 
 879     // PRE-MAIN-POST loop for aligned copy.
 880     __ BIND(L_entry);
 881 
 882     if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
 883       __ cmpq(temp1, threshold[shift]);
 884       __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 885     }
 886 
 887     if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
 888       // Partial copy to make dst address 32 byte aligned.
 889       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
 890       __ andq(temp2, 31);
 891       __ jcc(Assembler::equal, L_main_pre_loop);
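           // temp2 holds the misalignment, in bytes, of the end of the destination
           // region; the backward copy peels that many elements (after the shift
           // below) so the main loop runs on 32 byte aligned chunks.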
 892 
 893       if (shift) {
 894         __ shrq(temp2, shift);
 895       }
 896       __ subq(temp1, temp2);
 897       copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
 898 
 899       __ cmpq(temp1, loop_size[shift]);
 900       __ jcc(Assembler::less, L_tail);
 901 
 902       __ BIND(L_main_pre_loop);
 903 
 904       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 905       __ align32();
 906       __ BIND(L_main_loop);
 907          copy64_avx(to, from, temp1, xmm1, true, shift, -64);
 908          copy64_avx(to, from, temp1, xmm1, true, shift, -128);
 909          copy64_avx(to, from, temp1, xmm1, true, shift, -192);
 910          __ subptr(temp1, loop_size[shift]);
 911          __ cmpq(temp1, loop_size[shift]);
 912          __ jcc(Assembler::greater, L_main_loop);
 913 
 914       // Tail loop.
 915       __ jmp(L_tail);
 916     }
 917 
 918     if (MaxVectorSize > 32) {
 919       __ BIND(L_pre_main_post_64);
 920       // Partial copy to make dst address 64 byte aligned.
 921       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
 922       __ andq(temp2, 63);
 923       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 924 
 925       if (shift) {
 926         __ shrq(temp2, shift);
 927       }
 928       __ subq(temp1, temp2);
 929       copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
 930 
 931       __ cmpq(temp1, loop_size[shift]);
 932       __ jcc(Assembler::less, L_tail64);
 933 
 934       __ BIND(L_main_pre_loop_64bytes);
 935 
 936       // Main loop with aligned copy block size of 192 bytes at
 937       // 64 byte copy granularity.
 938       __ align32();
 939       __ BIND(L_main_loop_64bytes);
 940          copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
 941          copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
 942          copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
 943          __ subq(temp1, loop_size[shift]);
 944          __ cmpq(temp1, loop_size[shift]);
 945          __ jcc(Assembler::greater, L_main_loop_64bytes);
 946 
 947       // Zero length check.
 948       __ cmpq(temp1, 0);
 949       __ jcc(Assembler::lessEqual, L_exit);
 950 
 951       __ BIND(L_tail64);
 952 
 953       // Tail handling using 64 byte [masked] vector copy operations.
 954       use64byteVector = true;
 955       __ mov64(temp2, 0);
 956       __ movq(temp3, temp1);
 957       arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
 958                                             temp4, use64byteVector, L_entry, L_exit);
 959     }
 960     __ BIND(L_exit);
 961   }
 962   address ucme_exit_pc = __ pc();
 963   // When called from generic_arraycopy, r11 contains specific values
 964   // used during the arraycopy epilogue; re-initialize r11 here.
 965   if (is_oop) {
 966     __ movq(r11, count);
 967   }
 968   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
 969   restore_argument_regs(type);
 970   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
 971   __ xorptr(rax, rax); // return 0
 972   __ vzeroupper();
 973   __ leave(); // required for proper stackwalking of RuntimeStub frame
 974   __ ret(0);
 975 
 976   return start;
 977 }
 978 
 979 void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
 980                                                  Register to, Register count, int shift,
 981                                                  Register index, Register temp,
 982                                                  bool use64byteVector, Label& L_entry, Label& L_exit) {
 983   Label L_entry_64, L_entry_96, L_entry_128;
 984   Label L_entry_160, L_entry_192;
 985 
 986   int size_mat[][6] = {
 987   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
 988   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
 989   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
 990   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
 991   };
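       // Each row gives the element counts equivalent to 32, 64, 96, 128, 160 and
       // 192 bytes for the corresponding element size (i.e. bytes >> shift).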
 992 
 993   // Case A) Special case for length less than or equal to 32 bytes.
 994   __ cmpq(count, size_mat[shift][0]);
 995   __ jccb(Assembler::greater, L_entry_64);
 996   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
 997   __ jmp(L_exit);
 998 
 999   // Case B) Special case for length less than or equal to 64 bytes.
1000   __ BIND(L_entry_64);
1001   __ cmpq(count, size_mat[shift][1]);
1002   __ jccb(Assembler::greater, L_entry_96);
1003   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
1004   __ jmp(L_exit);
1005 
1006   // Case C) Special case for length less than or equal to 96 bytes.
1007   __ BIND(L_entry_96);
1008   __ cmpq(count, size_mat[shift][2]);
1009   __ jccb(Assembler::greater, L_entry_128);
1010   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1011   __ subq(count, 64 >> shift);
1012   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
1013   __ jmp(L_exit);
1014 
1015   // Case D) Special case for length less than or equal to 128 bytes.
1016   __ BIND(L_entry_128);
1017   __ cmpq(count, size_mat[shift][3]);
1018   __ jccb(Assembler::greater, L_entry_160);
1019   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1020   copy32_avx(to, from, index, xmm, shift, 64);
1021   __ subq(count, 96 >> shift);
1022   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
1023   __ jmp(L_exit);
1024 
1025   // Case E) Special case for length less than or equal to 160 bytes.
1026   __ BIND(L_entry_160);
1027   __ cmpq(count, size_mat[shift][4]);
1028   __ jccb(Assembler::greater, L_entry_192);
1029   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1030   copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1031   __ subq(count, 128 >> shift);
1032   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
1033   __ jmp(L_exit);
1034 
1035   // Case F) Special case for length less than or equal to 192 bytes.
1036   __ BIND(L_entry_192);
1037   __ cmpq(count, size_mat[shift][5]);
1038   __ jcc(Assembler::greater, L_entry);
1039   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1040   copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1041   copy32_avx(to, from, index, xmm, shift, 128);
1042   __ subq(count, 160 >> shift);
1043   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
1044   __ jmp(L_exit);
1045 }
1046 
1047 void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
1048                                                      Register to, Register count, int shift, Register index,
1049                                                      Register temp, Label& L_exit) {
1050   Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
1051 
1052   int size_mat[][4] = {
1053   /* T_BYTE */ {64, 128, 192, 256},
1054   /* T_SHORT*/ {32, 64 , 96 , 128},
1055   /* T_INT  */ {16, 32 , 48 ,  64},
1056   /* T_LONG */ { 8, 16 , 24 ,  32}
1057   };
1058 
1059   assert(MaxVectorSize == 64, "vector length != 64");
1060   // Case A) Special case for length less than or equal to 64 bytes.
1061   __ BIND(L_entry_64);
1062   __ cmpq(count, size_mat[shift][0]);
1063   __ jccb(Assembler::greater, L_entry_128);
1064   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
1065   __ jmp(L_exit);
1066 
1067   // Case B) Special case for length less than or equal to 128 bytes.
1068   __ BIND(L_entry_128);
1069   __ cmpq(count, size_mat[shift][1]);
1070   __ jccb(Assembler::greater, L_entry_192);
1071   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1072   __ subq(count, 64 >> shift);
1073   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
1074   __ jmp(L_exit);
1075 
1076   // Case C) Special case for length less than or equal to 192 bytes.
1077   __ BIND(L_entry_192);
1078   __ cmpq(count, size_mat[shift][2]);
1079   __ jcc(Assembler::greater, L_entry_256);
1080   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1081   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1082   __ subq(count, 128 >> shift);
1083   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
1084   __ jmp(L_exit);
1085 
1086   // Case D) Special case for length less than or equal to 256 bytes.
1087   __ BIND(L_entry_256);
1088   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1089   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1090   copy64_avx(to, from, index, xmm, false, shift, 128, true);
1091   __ subq(count, 192 >> shift);
1092   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
1093   __ jmp(L_exit);
1094 }
1095 
1096 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
1097                                                            Register to, Register start_index, Register end_index,
1098                                                            Register count, int shift, Register temp,
1099                                                            bool use64byteVector, Label& L_entry, Label& L_exit) {
1100   Label L_entry_64, L_entry_96, L_entry_128;
1101   Label L_entry_160, L_entry_192;
1102   bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
1103 
1104   int size_mat[][6] = {
1105   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
1106   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
1107   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
1108   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
1109   };
1110 
1111   // Case A) Special case for length less than or equal to 32 bytes.
1112   __ cmpq(count, size_mat[shift][0]);
1113   __ jccb(Assembler::greater, L_entry_64);
1114   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1115   __ jmp(L_exit);
1116 
1117   // Case B) Special case for length less than or equal to 64 bytes.
1118   __ BIND(L_entry_64);
1119   __ cmpq(count, size_mat[shift][1]);
1120   __ jccb(Assembler::greater, L_entry_96);
1121   if (avx3) {
1122      copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
1123   } else {
1124      copy32_avx(to, from, end_index, xmm, shift, -32);
1125      __ subq(count, 32 >> shift);
1126      copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1127   }
1128   __ jmp(L_exit);
1129 
1130   // Case C) Special case for length less than or equal to 96 bytes.
1131   __ BIND(L_entry_96);
1132   __ cmpq(count, size_mat[shift][2]);
1133   __ jccb(Assembler::greater, L_entry_128);
1134   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1135   __ subq(count, 64 >> shift);
1136   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1137   __ jmp(L_exit);
1138 
1139   // Case D) Special case for length less than or equal to 128 bytes.
1140   __ BIND(L_entry_128);
1141   __ cmpq(count, size_mat[shift][3]);
1142   __ jccb(Assembler::greater, L_entry_160);
1143   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1144   copy32_avx(to, from, end_index, xmm, shift, -96);
1145   __ subq(count, 96 >> shift);
1146   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1147   __ jmp(L_exit);
1148 
1149   // Case E) Special case for length less than or equal to 160 bytes.
1150   __ BIND(L_entry_160);
1151   __ cmpq(count, size_mat[shift][4]);
1152   __ jccb(Assembler::greater, L_entry_192);
1153   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1154   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1155   __ subq(count, 128 >> shift);
1156   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1157   __ jmp(L_exit);
1158 
1159   // Case F) Special case for length less than or equal to 192 bytes.
1160   __ BIND(L_entry_192);
1161   __ cmpq(count, size_mat[shift][5]);
1162   __ jcc(Assembler::greater, L_entry);
1163   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1164   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1165   copy32_avx(to, from, end_index, xmm, shift, -160);
1166   __ subq(count, 160 >> shift);
1167   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1168   __ jmp(L_exit);
1169 }
1170 
1171 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1172                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1173                                 int shift, int offset) {
1174   if (MaxVectorSize == 64) {
1175     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
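         // Prefetch the source 512 and 1024 bytes ahead of the current copy
         // position (four 64 byte cache lines at each distance) to keep the
         // streaming loads below fed from memory.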
1176     __ prefetcht0(Address(src, index, scale, offset + 0x200));
1177     __ prefetcht0(Address(src, index, scale, offset + 0x240));
1178     __ prefetcht0(Address(src, index, scale, offset + 0x280));
1179     __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1180 
1181     __ prefetcht0(Address(src, index, scale, offset + 0x400));
1182     __ prefetcht0(Address(src, index, scale, offset + 0x440));
1183     __ prefetcht0(Address(src, index, scale, offset + 0x480));
1184     __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1185 
1186     __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1187     __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1188     __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1189     __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1190 
1191     __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1192     __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1193     __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1194     __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1195   }
1196 }
1197 
1198 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1199                                        KRegister mask, Register length, Register index,
1200                                        Register temp, int shift, int offset,
1201                                        bool use64byteVector) {
1202   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1203   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1204   if (!use64byteVector) {
1205     copy32_avx(dst, src, index, xmm, shift, offset);
1206     __ subptr(length, 32 >> shift);
1207     copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1208   } else {
1209     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1210     assert(MaxVectorSize == 64, "vector length != 64");
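         // Build an opmask with the low 'length' bits set: bzhi clears every bit
         // of -1 at position >= length, e.g. length == 5 yields 0b11111, so only
         // the first five elements are loaded and stored.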
1211     __ mov64(temp, -1L);
1212     __ bzhiq(temp, temp, length);
1213     __ kmovql(mask, temp);
1214     __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1215     __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1216   }
1217 }
1218 
1219 
1220 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1221                                        KRegister mask, Register length, Register index,
1222                                        Register temp, int shift, int offset) {
1223   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1224   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1225   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1226   __ mov64(temp, -1L);
1227   __ bzhiq(temp, temp, length);
1228   __ kmovql(mask, temp);
1229   __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1230   __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1231 }
1232 
1233 
1234 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1235                                 int shift, int offset) {
1236   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1237   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1238   __ vmovdqu(xmm, Address(src, index, scale, offset));
1239   __ vmovdqu(Address(dst, index, scale, offset), xmm);
1240 }
1241 
1242 
1243 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1244                                 bool conjoint, int shift, int offset, bool use64byteVector) {
1245   assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1246   if (!use64byteVector) {
1247     if (conjoint) {
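           // For a backward (conjoint) copy, handle the upper 32 bytes before the
           // lower 32 so an overlapping destination never overwrites source bytes
           // that have not been read yet.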
1248       copy32_avx(dst, src, index, xmm, shift, offset+32);
1249       copy32_avx(dst, src, index, xmm, shift, offset);
1250     } else {
1251       copy32_avx(dst, src, index, xmm, shift, offset);
1252       copy32_avx(dst, src, index, xmm, shift, offset+32);
1253     }
1254   } else {
1255     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1256     __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1257     __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1258   }
1259 }
1260 
1261 #endif // COMPILER2_OR_JVMCI
1262 
1263 
1264 // Arguments:
1265 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1266 //             ignored
1267 //   name    - stub name string
1268 //
1269 // Inputs:
1270 //   c_rarg0   - source array address
1271 //   c_rarg1   - destination array address
1272 //   c_rarg2   - element count, treated as ssize_t, can be zero
1273 //
1274 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1275 // we let the hardware handle it.  The one to eight bytes within words,
1276 // dwords or qwords that span cache line boundaries will still be loaded
1277 // and stored atomically.
1278 //
1279 // Side Effects:
1280 //   disjoint_byte_copy_entry is set to the no-overlap entry point
1281 //   used by generate_conjoint_byte_copy().
1282 //
1283 address StubGenerator::generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1284 #if COMPILER2_OR_JVMCI
1285   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1286      return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1287                                                aligned, false, false);
1288   }
1289 #endif
1290   __ align(CodeEntryAlignment);
1291   StubCodeMark mark(this, "StubRoutines", name);
1292   address start = __ pc();
1293   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1294 
1295   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1296   Label L_copy_byte, L_exit;
1297   const Register from        = rdi;  // source array address
1298   const Register to          = rsi;  // destination array address
1299   const Register count       = rdx;  // elements count
1300   const Register byte_count  = rcx;
1301   const Register qword_count = count;
1302   const Register end_from    = from; // source array end address
1303   const Register end_to      = to;   // destination array end address
1304   // End pointers are inclusive, and if count is not zero they point
1305   // to the last unit copied:  end_to[0] := end_from[0]
1306 
1307   __ enter(); // required for proper stackwalking of RuntimeStub frame
1308   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1309 
1310   if (entry != nullptr) {
1311     *entry = __ pc();
1312      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1313     BLOCK_COMMENT("Entry:");
1314   }
1315 
1316   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1317                     // r9 and r10 may be used to save non-volatile registers
1318 
1319   {
1320     // UnsafeMemoryAccess page error: continue after unsafe access
1321     UnsafeMemoryAccessMark umam(this, !aligned, true);
1322     // 'from', 'to' and 'count' are now valid
1323     __ movptr(byte_count, count);
1324     __ shrptr(count, 3); // count => qword_count
1325 
1326     // Copy from low to high addresses.  Use 'to' as scratch.
1327     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1328     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1329     __ negptr(qword_count); // make the count negative
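         // With the end pointers fixed and qword_count made negative, the copy
         // loops below index off the array ends and count up toward zero, so each
         // back-edge is a single jcc(notZero).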
1330     __ jmp(L_copy_bytes);
1331 
1332     // Copy trailing qwords
1333   __ BIND(L_copy_8_bytes);
1334     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1335     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1336     __ increment(qword_count);
1337     __ jcc(Assembler::notZero, L_copy_8_bytes);
1338 
1339     // Check for and copy trailing dword
1340   __ BIND(L_copy_4_bytes);
1341     __ testl(byte_count, 4);
1342     __ jccb(Assembler::zero, L_copy_2_bytes);
1343     __ movl(rax, Address(end_from, 8));
1344     __ movl(Address(end_to, 8), rax);
1345 
1346     __ addptr(end_from, 4);
1347     __ addptr(end_to, 4);
1348 
1349     // Check for and copy trailing word
1350   __ BIND(L_copy_2_bytes);
1351     __ testl(byte_count, 2);
1352     __ jccb(Assembler::zero, L_copy_byte);
1353     __ movw(rax, Address(end_from, 8));
1354     __ movw(Address(end_to, 8), rax);
1355 
1356     __ addptr(end_from, 2);
1357     __ addptr(end_to, 2);
1358 
1359     // Check for and copy trailing byte
1360   __ BIND(L_copy_byte);
1361     __ testl(byte_count, 1);
1362     __ jccb(Assembler::zero, L_exit);
1363     __ movb(rax, Address(end_from, 8));
1364     __ movb(Address(end_to, 8), rax);
1365   }
1366 __ BIND(L_exit);
1367   address ucme_exit_pc = __ pc();
1368   restore_arg_regs();
1369   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1370   __ xorptr(rax, rax); // return 0
1371   __ vzeroupper();
1372   __ leave(); // required for proper stackwalking of RuntimeStub frame
1373   __ ret(0);
1374 
1375   {
1376     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1377     // Copy in multi-byte chunks
1378     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1379     __ jmp(L_copy_4_bytes);
1380   }
1381   return start;
1382 }
1383 
1384 
1385 // Arguments:
1386 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1387 //             ignored
1388 //   name    - stub name string
1389 //
1390 // Inputs:
1391 //   c_rarg0   - source array address
1392 //   c_rarg1   - destination array address
1393 //   c_rarg2   - element count, treated as ssize_t, can be zero
1394 //
1395 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1396 // we let the hardware handle it.  The one to eight bytes within words,
1397 // dwords or qwords that span cache line boundaries will still be loaded
1398 // and stored atomically.
1399 //
1400 address StubGenerator::generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1401                                                    address* entry, const char *name) {
1402 #if COMPILER2_OR_JVMCI
1403   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1404      return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
1405                                                nooverlap_target, aligned, false, false);
1406   }
1407 #endif
1408   __ align(CodeEntryAlignment);
1409   StubCodeMark mark(this, "StubRoutines", name);
1410   address start = __ pc();
1411   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1412 
1413   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1414   const Register from        = rdi;  // source array address
1415   const Register to          = rsi;  // destination array address
1416   const Register count       = rdx;  // elements count
1417   const Register byte_count  = rcx;
1418   const Register qword_count = count;
1419 
1420   __ enter(); // required for proper stackwalking of RuntimeStub frame
1421   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1422 
1423   if (entry != nullptr) {
1424     *entry = __ pc();
1425     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1426     BLOCK_COMMENT("Entry:");
1427   }
1428 
1429   array_overlap_test(nooverlap_target, Address::times_1);
1430   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1431                     // r9 and r10 may be used to save non-volatile registers
1432 
1433   {
1434     // UnsafeMemoryAccess page error: continue after unsafe access
1435     UnsafeMemoryAccessMark umam(this, !aligned, true);
1436     // 'from', 'to' and 'count' are now valid
1437     __ movptr(byte_count, count);
1438     __ shrptr(count, 3);   // count => qword_count
1439 
1440     // Copy from high to low addresses.
1441 
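         // Peel off the tail (odd byte, then word, then dword) first so the
         // remaining length is a whole number of qwords for the backward bulk loop.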
1442     // Check for and copy trailing byte
1443     __ testl(byte_count, 1);
1444     __ jcc(Assembler::zero, L_copy_2_bytes);
1445     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1446     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1447     __ decrement(byte_count); // Adjust for possible trailing word
1448 
1449     // Check for and copy trailing word
1450   __ BIND(L_copy_2_bytes);
1451     __ testl(byte_count, 2);
1452     __ jcc(Assembler::zero, L_copy_4_bytes);
1453     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1454     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1455 
1456     // Check for and copy trailing dword
1457   __ BIND(L_copy_4_bytes);
1458     __ testl(byte_count, 4);
1459     __ jcc(Assembler::zero, L_copy_bytes);
1460     __ movl(rax, Address(from, qword_count, Address::times_8));
1461     __ movl(Address(to, qword_count, Address::times_8), rax);
1462     __ jmp(L_copy_bytes);
1463 
1464     // Copy trailing qwords
1465   __ BIND(L_copy_8_bytes);
1466     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1467     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1468     __ decrement(qword_count);
1469     __ jcc(Assembler::notZero, L_copy_8_bytes);
1470   }
1471   restore_arg_regs();
1472   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1473   __ xorptr(rax, rax); // return 0
1474   __ vzeroupper();
1475   __ leave(); // required for proper stackwalking of RuntimeStub frame
1476   __ ret(0);
1477 
1478   {
1479     // UnsafeMemoryAccess page error: continue after unsafe access
1480     UnsafeMemoryAccessMark umam(this, !aligned, true);
1481     // Copy in multi-byte chunks
1482     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1483   }
1484   restore_arg_regs();
1485   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1486   __ xorptr(rax, rax); // return 0
1487   __ vzeroupper();
1488   __ leave(); // required for proper stackwalking of RuntimeStub frame
1489   __ ret(0);
1490 
1491   return start;
1492 }
1493 
1494 
1495 // Arguments:
1496 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1497 //             ignored
1498 //   name    - stub name string
1499 //
1500 // Inputs:
1501 //   c_rarg0   - source array address
1502 //   c_rarg1   - destination array address
1503 //   c_rarg2   - element count, treated as ssize_t, can be zero
1504 //
1505 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1506 // let the hardware handle it.  The two or four words within dwords
1507 // or qwords that span cache line boundaries will still be loaded
1508 // and stored atomically.
1509 //
1510 // Side Effects:
1511 //   disjoint_short_copy_entry is set to the no-overlap entry point
1512 //   used by generate_conjoint_short_copy().
1513 //
1514 address StubGenerator::generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1515 #if COMPILER2_OR_JVMCI
1516   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1517      return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
1518                                                aligned, false, false);
1519   }
1520 #endif
1521 
1522   __ align(CodeEntryAlignment);
1523   StubCodeMark mark(this, "StubRoutines", name);
1524   address start = __ pc();
1525   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1526 
1527   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1528   const Register from        = rdi;  // source array address
1529   const Register to          = rsi;  // destination array address
1530   const Register count       = rdx;  // elements count
1531   const Register word_count  = rcx;
1532   const Register qword_count = count;
1533   const Register end_from    = from; // source array end address
1534   const Register end_to      = to;   // destination array end address
1535   // End pointers are inclusive, and if count is not zero they point
1536   // to the last unit copied:  end_to[0] := end_from[0]
1537 
1538   __ enter(); // required for proper stackwalking of RuntimeStub frame
1539   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1540 
1541   if (entry != nullptr) {
1542     *entry = __ pc();
1543     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1544     BLOCK_COMMENT("Entry:");
1545   }
1546 
1547   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1548                     // r9 and r10 may be used to save non-volatile registers
1549 
1550   {
1551     // UnsafeMemoryAccess page error: continue after unsafe access
1552     UnsafeMemoryAccessMark umam(this, !aligned, true);
1553     // 'from', 'to' and 'count' are now valid
1554     __ movptr(word_count, count);
1555     __ shrptr(count, 2); // count => qword_count
1556 
1557     // Copy from low to high addresses.  Use 'to' as scratch.
1558     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1559     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1560     __ negptr(qword_count);
1561     __ jmp(L_copy_bytes);
1562 
1563     // Copy trailing qwords
1564   __ BIND(L_copy_8_bytes);
1565     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1566     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1567     __ increment(qword_count);
1568     __ jcc(Assembler::notZero, L_copy_8_bytes);
1569 
1570     // Original 'dest' is trashed, so we can't use it as a
1571     // base register for a possible trailing word copy
1572 
1573     // Check for and copy trailing dword
1574   __ BIND(L_copy_4_bytes);
1575     __ testl(word_count, 2);
1576     __ jccb(Assembler::zero, L_copy_2_bytes);
1577     __ movl(rax, Address(end_from, 8));
1578     __ movl(Address(end_to, 8), rax);
1579 
1580     __ addptr(end_from, 4);
1581     __ addptr(end_to, 4);
1582 
1583     // Check for and copy trailing word
1584   __ BIND(L_copy_2_bytes);
1585     __ testl(word_count, 1);
1586     __ jccb(Assembler::zero, L_exit);
1587     __ movw(rax, Address(end_from, 8));
1588     __ movw(Address(end_to, 8), rax);
1589   }
1590 __ BIND(L_exit);
1591   address ucme_exit_pc = __ pc();
1592   restore_arg_regs();
1593   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1594   __ xorptr(rax, rax); // return 0
1595   __ vzeroupper();
1596   __ leave(); // required for proper stackwalking of RuntimeStub frame
1597   __ ret(0);
1598 
1599   {
1600     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1601     // Copy in multi-byte chunks
1602     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1603     __ jmp(L_copy_4_bytes);
1604   }
1605 
1606   return start;
1607 }
1608 
1609 
1610 address StubGenerator::generate_fill(BasicType t, bool aligned, const char *name) {
1611   __ align(CodeEntryAlignment);
1612   StubCodeMark mark(this, "StubRoutines", name);
1613   address start = __ pc();
1614 
1615   BLOCK_COMMENT("Entry:");
1616 
1617   const Register to       = c_rarg0;  // destination array address
1618   const Register value    = c_rarg1;  // value
1619   const Register count    = c_rarg2;  // elements count
1620   __ mov(r11, count);
1621 
1622   __ enter(); // required for proper stackwalking of RuntimeStub frame
1623 
1624   {
1625     // Add set memory mark to protect against unsafe accesses faulting
1626     UnsafeMemoryAccessMark umam(this, ((t == T_BYTE) && !aligned), true);
1627     __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1628   }
1629 
1630   __ vzeroupper();
1631   __ leave(); // required for proper stackwalking of RuntimeStub frame
1632   __ ret(0);
1633 
1634   return start;
1635 }
1636 
1637 
1638 // Arguments:
1639 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1640 //             ignored
1641 //   name    - stub name string
1642 //
1643 // Inputs:
1644 //   c_rarg0   - source array address
1645 //   c_rarg1   - destination array address
1646 //   c_rarg2   - element count, treated as ssize_t, can be zero
1647 //
1648 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1649 // let the hardware handle it.  The two or four words within dwords
1650 // or qwords that span cache line boundaries will still be loaded
1651 // and stored atomically.
1652 //
1653 address StubGenerator::generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1654                                                     address *entry, const char *name) {
1655 #if COMPILER2_OR_JVMCI
1656   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1657      return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
1658                                                nooverlap_target, aligned, false, false);
1659   }
1660 #endif
1661   __ align(CodeEntryAlignment);
1662   StubCodeMark mark(this, "StubRoutines", name);
1663   address start = __ pc();
1664   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1665 
1666   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1667   const Register from        = rdi;  // source array address
1668   const Register to          = rsi;  // destination array address
1669   const Register count       = rdx;  // elements count
1670   const Register word_count  = rcx;
1671   const Register qword_count = count;
1672 
1673   __ enter(); // required for proper stackwalking of RuntimeStub frame
1674   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1675 
1676   if (entry != nullptr) {
1677     *entry = __ pc();
1678     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1679     BLOCK_COMMENT("Entry:");
1680   }
1681 
1682   array_overlap_test(nooverlap_target, Address::times_2);
1683   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1684                     // r9 and r10 may be used to save non-volatile registers
1685 
1686   {
1687     // UnsafeMemoryAccess page error: continue after unsafe access
1688     UnsafeMemoryAccessMark umam(this, !aligned, true);
1689     // 'from', 'to' and 'count' are now valid
1690     __ movptr(word_count, count);
1691     __ shrptr(count, 2); // count => qword_count
1692 
1693     // Copy from high to low addresses.  Use 'to' as scratch.
1694 
1695     // Check for and copy trailing word
1696     __ testl(word_count, 1);
1697     __ jccb(Assembler::zero, L_copy_4_bytes);
1698     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1699     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1700 
1701    // Check for and copy trailing dword
1702   __ BIND(L_copy_4_bytes);
1703     __ testl(word_count, 2);
1704     __ jcc(Assembler::zero, L_copy_bytes);
1705     __ movl(rax, Address(from, qword_count, Address::times_8));
1706     __ movl(Address(to, qword_count, Address::times_8), rax);
1707     __ jmp(L_copy_bytes);
1708 
1709     // Copy trailing qwords
1710   __ BIND(L_copy_8_bytes);
1711     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1712     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1713     __ decrement(qword_count);
1714     __ jcc(Assembler::notZero, L_copy_8_bytes);
1715   }
1716   restore_arg_regs();
1717   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1718   __ xorptr(rax, rax); // return 0
1719   __ vzeroupper();
1720   __ leave(); // required for proper stackwalking of RuntimeStub frame
1721   __ ret(0);
1722 
1723   {
1724     // UnsafeMemoryAccess page error: continue after unsafe access
1725     UnsafeMemoryAccessMark umam(this, !aligned, true);
1726     // Copy in multi-byte chunks
1727     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1728   }
1729   restore_arg_regs();
1730   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1731   __ xorptr(rax, rax); // return 0
1732   __ vzeroupper();
1733   __ leave(); // required for proper stackwalking of RuntimeStub frame
1734   __ ret(0);
1735 
1736   return start;
1737 }
1738 
1739 
1740 // Arguments:
1741 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1742 //             ignored
1743 //   is_oop  - true => oop array, so generate store check code
1744 //   name    - stub name string
1745 //
1746 // Inputs:
1747 //   c_rarg0   - source array address
1748 //   c_rarg1   - destination array address
1749 //   c_rarg2   - element count, treated as ssize_t, can be zero
1750 //
1751 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1752 // the hardware handle it.  The two dwords within qwords that span
1753 // cache line boundaries will still be loaded and stored atomically.
1754 //
1755 // Side Effects:
1756 //   disjoint_int_copy_entry is set to the no-overlap entry point
1757 //   used by generate_conjoint_int_oop_copy().
1758 //
1759 address StubGenerator::generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1760                                                       const char *name, bool dest_uninitialized) {
1761   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1762 #if COMPILER2_OR_JVMCI
1763   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1764      return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
1765                                                aligned, is_oop, dest_uninitialized);
1766   }
1767 #endif
1768 
1769   __ align(CodeEntryAlignment);
1770   StubCodeMark mark(this, "StubRoutines", name);
1771   address start = __ pc();
1772 
1773   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1774   const Register from        = rdi;  // source array address
1775   const Register to          = rsi;  // destination array address
1776   const Register count       = rdx;  // elements count
1777   const Register dword_count = rcx;
1778   const Register qword_count = count;
1779   const Register end_from    = from; // source array end address
1780   const Register end_to      = to;   // destination array end address
1781   // End pointers are inclusive, and if count is not zero they point
1782   // to the last unit copied:  end_to[0] := end_from[0]
1783 
1784   __ enter(); // required for proper stackwalking of RuntimeStub frame
1785   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1786 
1787   if (entry != nullptr) {
1788     *entry = __ pc();
1789     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1790     BLOCK_COMMENT("Entry:");
1791   }
1792 
1793   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1794                                  // r9 is used to save r15_thread
1795 
1796   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1797   if (dest_uninitialized) {
1798     decorators |= IS_DEST_UNINITIALIZED;
1799   }
1800   if (aligned) {
1801     decorators |= ARRAYCOPY_ALIGNED;
1802   }
1803 
1804   BasicType type = is_oop ? T_OBJECT : T_INT;
1805   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1806 
1807   {
1808     // UnsafeMemoryAccess page error: continue after unsafe access
1809     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
1810     // 'from', 'to' and 'count' are now valid
1811     __ movptr(dword_count, count);
1812     __ shrptr(count, 1); // count => qword_count
1813 
1814     // Copy from low to high addresses.  Use 'to' as scratch.
1815     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1816     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1817     __ negptr(qword_count);
1818     __ jmp(L_copy_bytes);
1819 
1820     // Copy trailing qwords
1821   __ BIND(L_copy_8_bytes);
1822     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1823     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1824     __ increment(qword_count);
1825     __ jcc(Assembler::notZero, L_copy_8_bytes);
1826 
1827     // Check for and copy trailing dword
1828   __ BIND(L_copy_4_bytes);
1829     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1830     __ jccb(Assembler::zero, L_exit);
1831     __ movl(rax, Address(end_from, 8));
1832     __ movl(Address(end_to, 8), rax);
1833   }
1834 __ BIND(L_exit);
1835   address ucme_exit_pc = __ pc();
1836   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1837   restore_arg_regs_using_thread();
1838   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1839   __ vzeroupper();
1840   __ xorptr(rax, rax); // return 0
1841   __ leave(); // required for proper stackwalking of RuntimeStub frame
1842   __ ret(0);
1843 
1844   {
1845     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
1846     // Copy in multi-byte chunks
1847     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
1848     __ jmp(L_copy_4_bytes);
1849   }
1850 
1851   return start;
1852 }
1853 
1854 
1855 // Arguments:
1856 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1857 //             ignored
1858 //   is_oop  - true => oop array, so generate store check code
1859 //   name    - stub name string
1860 //
1861 // Inputs:
1862 //   c_rarg0   - source array address
1863 //   c_rarg1   - destination array address
1864 //   c_rarg2   - element count, treated as ssize_t, can be zero
1865 //
1866 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1867 // the hardware handle it.  The two dwords within qwords that span
1868 // cache line boundaries will still be loaded and stored atomically.
1869 //
1870 address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1871                                                       address *entry, const char *name,
1872                                                       bool dest_uninitialized) {
1873   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1874 #if COMPILER2_OR_JVMCI
1875   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1876      return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
1877                                                nooverlap_target, aligned, is_oop, dest_uninitialized);
1878   }
1879 #endif
1880   __ align(CodeEntryAlignment);
1881   StubCodeMark mark(this, "StubRoutines", name);
1882   address start = __ pc();
1883 
1884   Label L_copy_bytes, L_copy_8_bytes, L_exit;
1885   const Register from        = rdi;  // source array address
1886   const Register to          = rsi;  // destination array address
1887   const Register count       = rdx;  // elements count
1888   const Register dword_count = rcx;
1889   const Register qword_count = count;
1890 
1891   __ enter(); // required for proper stackwalking of RuntimeStub frame
1892   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1893 
1894   if (entry != nullptr) {
1895     *entry = __ pc();
1896      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1897     BLOCK_COMMENT("Entry:");
1898   }
1899 
1900   array_overlap_test(nooverlap_target, Address::times_4);
1901   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1902                                  // r9 is used to save r15_thread
1903 
1904   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1905   if (dest_uninitialized) {
1906     decorators |= IS_DEST_UNINITIALIZED;
1907   }
1908   if (aligned) {
1909     decorators |= ARRAYCOPY_ALIGNED;
1910   }
1911 
1912   BasicType type = is_oop ? T_OBJECT : T_INT;
1913   // no registers are destroyed by this call
1914   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1915 
1916   assert_clean_int(count, rax); // Make sure 'count' is clean int.
1917   {
1918     // UnsafeMemoryAccess page error: continue after unsafe access
1919     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
1920     // 'from', 'to' and 'count' are now valid
1921     __ movptr(dword_count, count);
1922     __ shrptr(count, 1); // count => qword_count
1923 
1924     // Copy from high to low addresses.  Use 'to' as scratch.
1925 
1926     // Check for and copy trailing dword
1927     __ testl(dword_count, 1);
1928     __ jcc(Assembler::zero, L_copy_bytes);
1929     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
1930     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
1931     __ jmp(L_copy_bytes);
1932 
1933     // Copy trailing qwords
1934   __ BIND(L_copy_8_bytes);
1935     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1936     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1937     __ decrement(qword_count);
1938     __ jcc(Assembler::notZero, L_copy_8_bytes);
1939   }
1940   if (is_oop) {
1941     __ jmp(L_exit);
1942   }
1943   restore_arg_regs_using_thread();
1944   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1945   __ xorptr(rax, rax); // return 0
1946   __ vzeroupper();
1947   __ leave(); // required for proper stackwalking of RuntimeStub frame
1948   __ ret(0);
1949 
1950   {
1951     // UnsafeMemoryAccess page error: continue after unsafe access
1952     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
1953     // Copy in multi-byte chunks
1954     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
1955   }
1956 
1957 __ BIND(L_exit);
1958   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1959   restore_arg_regs_using_thread();
1960   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1961   __ xorptr(rax, rax); // return 0
1962   __ vzeroupper();
1963   __ leave(); // required for proper stackwalking of RuntimeStub frame
1964   __ ret(0);
1965 
1966   return start;
1967 }
1968 
1969 
1970 // Arguments:
1971 //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1972 //             ignored
1973 //   is_oop  - true => oop array, so generate store check code
1974 //   name    - stub name string
1975 //
1976 // Inputs:
1977 //   c_rarg0   - source array address
1978 //   c_rarg1   - destination array address
1979 //   c_rarg2   - element count, treated as ssize_t, can be zero
1980 //
1981 // Side Effects:
1982 //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1983 //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1984 //
1985 address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
1986                                                        const char *name, bool dest_uninitialized) {
1987   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1988 #if COMPILER2_OR_JVMCI
1989   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1990      return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
1991                                                aligned, is_oop, dest_uninitialized);
1992   }
1993 #endif
1994   __ align(CodeEntryAlignment);
1995   StubCodeMark mark(this, "StubRoutines", name);
1996   address start = __ pc();
1997 
1998   Label L_copy_bytes, L_copy_8_bytes, L_exit;
1999   const Register from        = rdi;  // source array address
2000   const Register to          = rsi;  // destination array address
2001   const Register qword_count = rdx;  // elements count
2002   const Register end_from    = from; // source array end address
2003   const Register end_to      = rcx;  // destination array end address
2004   const Register saved_count = r11;
2005   // End pointers are inclusive, and if count is not zero they point
2006   // to the last unit copied:  end_to[0] := end_from[0]
2007 
2008   __ enter(); // required for proper stackwalking of RuntimeStub frame
2009   // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2010   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2011 
2012   if (entry != nullptr) {
2013     *entry = __ pc();
2014     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2015     BLOCK_COMMENT("Entry:");
2016   }
2017 
2018   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2019                                    // r9 is used to save r15_thread
2020   // 'from', 'to' and 'qword_count' are now valid
2021 
2022   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2023   if (dest_uninitialized) {
2024     decorators |= IS_DEST_UNINITIALIZED;
2025   }
2026   if (aligned) {
2027     decorators |= ARRAYCOPY_ALIGNED;
2028   }
2029 
2030   BasicType type = is_oop ? T_OBJECT : T_LONG;
2031   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2032   {
2033     // UnsafeMemoryAccess page error: continue after unsafe access
2034     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2035 
2036     // Copy from low to high addresses.  Use 'to' as scratch.
2037     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2038     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2039     __ negptr(qword_count);
2040     __ jmp(L_copy_bytes);
2041 
2042     // Copy trailing qwords
2043   __ BIND(L_copy_8_bytes);
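         // Element loads and stores go through the BarrierSetAssembler so that
         // collectors needing per-access barriers can instrument each copied oop.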
2044     bs->copy_load_at(_masm, decorators, type, 8,
2045                      rax, Address(end_from, qword_count, Address::times_8, 8),
2046                      r10);
2047     bs->copy_store_at(_masm, decorators, type, 8,
2048                       Address(end_to, qword_count, Address::times_8, 8), rax,
2049                       r10);
2050     __ increment(qword_count);
2051     __ jcc(Assembler::notZero, L_copy_8_bytes);
2052   }
2053   if (is_oop) {
2054     __ jmp(L_exit);
2055   } else {
2056     restore_arg_regs_using_thread();
2057     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2058     __ xorptr(rax, rax); // return 0
2059     __ vzeroupper();
2060     __ leave(); // required for proper stackwalking of RuntimeStub frame
2061     __ ret(0);
2062   }
2063 
2064   {
2065     // UnsafeMemoryAccess page error: continue after unsafe access
2066     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2067     // Copy in multi-byte chunks
2068     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2069   }
2070 
2071   __ BIND(L_exit);
2072   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2073   restore_arg_regs_using_thread();
2074   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2075                           SharedRuntime::_jlong_array_copy_ctr,
2076                  rscratch1); // Update counter after rscratch1 is free
2077   __ vzeroupper();
2078   __ xorptr(rax, rax); // return 0
2079   __ leave(); // required for proper stackwalking of RuntimeStub frame
2080   __ ret(0);
2081 
2082   return start;
2083 }
2084 
2085 
2086 // Arguments:
2087 //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2088 //             ignored
2089 //   is_oop  - true => oop array, so generate store check code
2090 //   name    - stub name string
2091 //
2092 // Inputs:
2093 //   c_rarg0   - source array address
2094 //   c_rarg1   - destination array address
2095 //   c_rarg2   - element count, treated as ssize_t, can be zero
2096 //
2097 address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2098                                                        address *entry, const char *name,
2099                                                        bool dest_uninitialized) {
2100   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2101 #if COMPILER2_OR_JVMCI
2102   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2103      return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
2104                                                nooverlap_target, aligned, is_oop, dest_uninitialized);
2105   }
2106 #endif
2107   __ align(CodeEntryAlignment);
2108   StubCodeMark mark(this, "StubRoutines", name);
2109   address start = __ pc();
2110 
2111   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2112   const Register from        = rdi;  // source array address
2113   const Register to          = rsi;  // destination array address
2114   const Register qword_count = rdx;  // elements count
2115   const Register saved_count = rcx;
2116 
2117   __ enter(); // required for proper stackwalking of RuntimeStub frame
2118   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2119 
2120   if (entry != nullptr) {
2121     *entry = __ pc();
2122     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2123     BLOCK_COMMENT("Entry:");
2124   }
2125 
2126   array_overlap_test(nooverlap_target, Address::times_8);
2127   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2128                                  // r9 is used to save r15_thread
2129   // 'from', 'to' and 'qword_count' are now valid
2130 
2131   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2132   if (dest_uninitialized) {
2133     decorators |= IS_DEST_UNINITIALIZED;
2134   }
2135   if (aligned) {
2136     decorators |= ARRAYCOPY_ALIGNED;
2137   }
2138 
2139   BasicType type = is_oop ? T_OBJECT : T_LONG;
2140   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2141   {
2142     // UnsafeMemoryAccess page error: continue after unsafe access
2143     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2144 
2145     __ jmp(L_copy_bytes);
2146 
2147     // Copy trailing qwords
2148   __ BIND(L_copy_8_bytes);
2149     bs->copy_load_at(_masm, decorators, type, 8,
2150                      rax, Address(from, qword_count, Address::times_8, -8),
2151                      r10);
2152     bs->copy_store_at(_masm, decorators, type, 8,
2153                       Address(to, qword_count, Address::times_8, -8), rax,
2154                       r10);
2155     __ decrement(qword_count);
2156     __ jcc(Assembler::notZero, L_copy_8_bytes);
2157   }
2158   if (is_oop) {
2159     __ jmp(L_exit);
2160   } else {
2161     restore_arg_regs_using_thread();
2162     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2163     __ xorptr(rax, rax); // return 0
2164     __ vzeroupper();
2165     __ leave(); // required for proper stackwalking of RuntimeStub frame
2166     __ ret(0);
2167   }
2168   {
2169     // UnsafeMemoryAccess page error: continue after unsafe access
2170     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2171 
2172     // Copy in multi-byte chunks
2173     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2174   }
2175   __ BIND(L_exit);
2176   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2177   restore_arg_regs_using_thread();
2178   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2179                           SharedRuntime::_jlong_array_copy_ctr,
2180                  rscratch1); // Update counter after rscratch1 is free
2181   __ vzeroupper();
2182   __ xorptr(rax, rax); // return 0
2183   __ leave(); // required for proper stackwalking of RuntimeStub frame
2184   __ ret(0);
2185 
2186   return start;
2187 }
2188 
2189 
2190 // Helper for generating a dynamic type check.
2191 // Smashes no registers.
2192 void StubGenerator::generate_type_check(Register sub_klass,
2193                                         Register super_check_offset,
2194                                         Register super_klass,
2195                                         Label& L_success) {
2196   assert_different_registers(sub_klass, super_check_offset, super_klass);
2197 
2198   BLOCK_COMMENT("type_check:");
2199 
2200   Label L_miss;
2201 
2202   __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
2203                                    super_check_offset);
2204   __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
2205 
2206   // Fall through on failure!
2207   __ BIND(L_miss);
2208 }
2209 
2210 //
2211 //  Generate checkcasting array copy stub
2212 //
2213 //  Input:
2214 //    c_rarg0   - source array address
2215 //    c_rarg1   - destination array address
2216 //    c_rarg2   - element count, treated as ssize_t, can be zero
2217 //    c_rarg3   - size_t ckoff (super_check_offset)
2218 // not Win64
2219 //    c_rarg4   - oop ckval (super_klass)
2220 // Win64
2221 //    rsp+40    - oop ckval (super_klass)
2222 //
2223 //  Output:
2224 //    rax ==  0  -  success
2225 //    rax == -1^K - failure, where K is partial transfer count
2226 //
2227 address StubGenerator::generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized) {
2228 
2229   Label L_load_element, L_store_element, L_do_card_marks, L_done;
2230 
2231   // Input registers (after setup_arg_regs)
2232   const Register from        = rdi;   // source array address
2233   const Register to          = rsi;   // destination array address
2234   const Register length      = rdx;   // elements count
2235   const Register ckoff       = rcx;   // super_check_offset
2236   const Register ckval       = r8;    // super_klass
2237 
2238   // Registers used as temps (r13, r14 are save-on-entry)
2239   const Register end_from    = from;  // source array end address
2240   const Register end_to      = r13;   // destination array end address
2241   const Register count       = rdx;   // -(count_remaining)
2242   const Register r14_length  = r14;   // saved copy of length
2243   // End pointers are inclusive, and if length is not zero they point
2244   // to the last unit copied:  end_to[0] := end_from[0]
2245 
2246   const Register rax_oop    = rax;    // actual oop copied
2247   const Register r11_klass  = r11;    // oop._klass
2248 
2249   //---------------------------------------------------------------
2250   // Assembler stub will be used for this call to arraycopy
2251   // if the two arrays are subtypes of Object[] but the
2252   // destination array type is not equal to or a supertype
2253   // of the source type.  Each element must be separately
2254   // checked.
2255 
2256   __ align(CodeEntryAlignment);
2257   StubCodeMark mark(this, "StubRoutines", name);
2258   address start = __ pc();
2259 
2260   __ enter(); // required for proper stackwalking of RuntimeStub frame
2261 
2262 #ifdef ASSERT
2263   // caller guarantees that the arrays really are different
2264   // otherwise, we would have to make conjoint checks
2265   { Label L;
2266     array_overlap_test(L, TIMES_OOP);
2267     __ stop("checkcast_copy within a single array");
2268     __ bind(L);
2269   }
2270 #endif //ASSERT
2271 
2272   setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
2273                                   // ckoff => rcx, ckval => r8
2274                                   // r9 is used to save r15_thread
2275 #ifdef _WIN64
2276   // last argument (#4) is on stack on Win64
2277   __ movptr(ckval, Address(rsp, 6 * wordSize));
2278 #endif
2279 
2280   // Caller of this entry point must set up the argument registers.
2281   if (entry != nullptr) {
2282     *entry = __ pc();
2283     BLOCK_COMMENT("Entry:");
2284   }
2285 
2286   // allocate spill slots for r13, r14
2287   enum {
2288     saved_r13_offset,
2289     saved_r14_offset,
2290     saved_r10_offset,
2291     saved_rbp_offset
2292   };
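       // Reserve saved_rbp_offset (== 3) stack words for r13, r14 and r10;
       // rbp itself was already saved by enter().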
2293   __ subptr(rsp, saved_rbp_offset * wordSize);
2294   __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2295   __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2296   __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2297 
2298 #ifdef ASSERT
2299     Label L2;
2300     __ get_thread(r14);
2301     __ cmpptr(r15_thread, r14);
2302     __ jcc(Assembler::equal, L2);
2303     __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2304     __ bind(L2);
2305 #endif // ASSERT
2306 
2307   // check that int operands are properly extended to size_t
2308   assert_clean_int(length, rax);
2309   assert_clean_int(ckoff, rax);
2310 
2311 #ifdef ASSERT
2312   BLOCK_COMMENT("assert consistent ckoff/ckval");
2313   // The ckoff and ckval must be mutually consistent,
2314   // even though caller generates both.
2315   { Label L;
2316     int sco_offset = in_bytes(Klass::super_check_offset_offset());
2317     __ cmpl(ckoff, Address(ckval, sco_offset));
2318     __ jcc(Assembler::equal, L);
2319     __ stop("super_check_offset inconsistent");
2320     __ bind(L);
2321   }
2322 #endif //ASSERT
2323 
2324   // Loop-invariant addresses.  They are exclusive end pointers.
2325   Address end_from_addr(from, length, TIMES_OOP, 0);
2326   Address   end_to_addr(to,   length, TIMES_OOP, 0);
2327   // Loop-variant addresses.  They assume post-incremented count < 0.
2328   Address from_element_addr(end_from, count, TIMES_OOP, 0);
2329   Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2330 
2331   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2332   if (dest_uninitialized) {
2333     decorators |= IS_DEST_UNINITIALIZED;
2334   }
2335 
2336   BasicType type = T_OBJECT;
2337   size_t element_size = UseCompressedOops ? 4 : 8;
2338 
2339   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2340   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2341 
2342   // Copy from low to high addresses, indexed from the end of each array.
2343   __ lea(end_from, end_from_addr);
2344   __ lea(end_to,   end_to_addr);
2345   __ movptr(r14_length, length);        // save a copy of the length
2346   assert(length == count, "");          // else fix next line:
2347   __ negptr(count);                     // negate and test the length
2348   __ jcc(Assembler::notZero, L_load_element);
2349 
2350   // Empty array:  Nothing to do.
2351   __ xorptr(rax, rax);                  // return 0 on (trivial) success
2352   __ jmp(L_done);
2353 
2354   // ======== begin loop ========
2355   // (Loop is rotated; its entry is L_load_element.)
2356   // Loop control:
2357   //   for (count = -count; count != 0; count++)
2358   // Base pointers src, dst are biased by 8*(count-1), to last element.
2359   __ align(OptoLoopAlignment);
2360 
2361   __ BIND(L_store_element);
2362   bs->copy_store_at(_masm,
2363                     decorators,
2364                     type,
2365                     element_size,
2366                     to_element_addr,
2367                     rax_oop,
2368                     r10);
2369   __ increment(count);               // increment the count toward zero
2370   __ jcc(Assembler::zero, L_do_card_marks);
2371 
2372   // ======== loop entry is here ========
2373   __ BIND(L_load_element);
2374   bs->copy_load_at(_masm,
2375                    decorators,
2376                    type,
2377                    element_size,
2378                    rax_oop,
2379                    from_element_addr,
2380                    r10);
2381   __ testptr(rax_oop, rax_oop);
2382   __ jcc(Assembler::zero, L_store_element);
2383 
2384   __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2385   generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2386   // ======== end loop ========
2387 
2388   // It was a real error; we must depend on the caller to finish the job.
2389   // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2390   // Emit GC store barriers for the oops we have copied (r14 + rdx),
2391   // and report their number to the caller.
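       // (rax is set to ~K below, so a caller that sees a negative result can
       // recover the number of elements already copied as K = -1 - rax.)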
2392   assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2393   Label L_post_barrier;
2394   __ addptr(r14_length, count);     // K = (original - remaining) oops
2395   __ movptr(rax, r14_length);       // save the value
2396   __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2397   __ jccb(Assembler::notZero, L_post_barrier);
2398   __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2399 
2400   // Come here on success only.
2401   __ BIND(L_do_card_marks);
2402   __ xorptr(rax, rax);              // return 0 on success
2403 
2404   __ BIND(L_post_barrier);
2405   bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2406 
2407   // Common exit point (success or failure).
2408   __ BIND(L_done);
2409   __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2410   __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2411   __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2412   restore_arg_regs_using_thread();
2413   INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2414   __ leave(); // required for proper stackwalking of RuntimeStub frame
2415   __ ret(0);
2416 
2417   return start;
2418 }
2419 
2420 
2421 //  Generate 'unsafe' array copy stub
2422 //  Though just as safe as the other stubs, it takes an unscaled
2423 //  size_t argument instead of an element count.
2424 //
2425 //  Input:
2426 //    c_rarg0   - source array address
2427 //    c_rarg1   - destination array address
2428 //    c_rarg2   - byte count, treated as ssize_t, can be zero
2429 //
2430 // Examines the alignment of the operands and dispatches
2431 // to a long, int, short, or byte copy loop.
2432 //
2433 address StubGenerator::generate_unsafe_copy(const char *name,
2434                                             address byte_copy_entry, address short_copy_entry,
2435                                             address int_copy_entry, address long_copy_entry) {
2436 
2437   Label L_long_aligned, L_int_aligned, L_short_aligned;
2438 
2439   // Input registers (before setup_arg_regs)
2440   const Register from        = c_rarg0;  // source array address
2441   const Register to          = c_rarg1;  // destination array address
2442   const Register size        = c_rarg2;  // byte count (size_t)
2443 
2444   // Register used as a temp
2445   const Register bits        = rax;      // test copy of low bits
2446 
2447   __ align(CodeEntryAlignment);
2448   StubCodeMark mark(this, "StubRoutines", name);
2449   address start = __ pc();
2450 
2451   __ enter(); // required for proper stackwalking of RuntimeStub frame
2452 
2453   // bump this on entry, not on exit:
2454   INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
2455 
2456   __ mov(bits, from);
2457   __ orptr(bits, to);
2458   __ orptr(bits, size);
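       // OR-ing both addresses and the byte count together means one low-bit
       // test per granularity selects the widest element size for which every
       // access in the copy is naturally aligned.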
2459 
2460   __ testb(bits, BytesPerLong-1);
2461   __ jccb(Assembler::zero, L_long_aligned);
2462 
2463   __ testb(bits, BytesPerInt-1);
2464   __ jccb(Assembler::zero, L_int_aligned);
2465 
2466   __ testb(bits, BytesPerShort-1);
2467   __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2468 
2469   __ BIND(L_short_aligned);
2470   __ shrptr(size, LogBytesPerShort); // size => short_count
2471   __ jump(RuntimeAddress(short_copy_entry));
2472 
2473   __ BIND(L_int_aligned);
2474   __ shrptr(size, LogBytesPerInt); // size => int_count
2475   __ jump(RuntimeAddress(int_copy_entry));
2476 
2477   __ BIND(L_long_aligned);
2478   __ shrptr(size, LogBytesPerLong); // size => qword_count
2479   __ jump(RuntimeAddress(long_copy_entry));
2480 
2481   return start;
2482 }
2483 
2484 
2485 // Static enum for helper
2486 enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
2487 // Helper for generate_unsafe_setmemory
2488 //
2489 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
2490 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
2491                                      Register size, Register wide_value,
2492                                      Register tmp, Label& L_exit,
2493                                      MacroAssembler *_masm) {
2494   Label L_Loop, L_Tail, L_TailLoop;
2495 
2496   int shiftval = 0;
2497   int incr = 0;
2498 
2499   switch (type) {
2500     case USM_SHORT:
2501       shiftval = 1;
2502       incr = 16;
2503       break;
2504     case USM_DWORD:
2505       shiftval = 2;
2506       incr = 32;
2507       break;
2508     case USM_QUADWORD:
2509       shiftval = 3;
2510       incr = 64;
2511       break;
2512   }
2513 
2514   // At this point, we know the lower bits of size are zero
2515   __ shrq(size, shiftval);
2516   // size now holds the number of chunks, each 2, 4 or 8 bytes wide
2517 
2518   // Number of (8*X)-byte chunks into tmp
2519   __ movq(tmp, size);
2520   __ shrq(tmp, 3);
2521   __ jccb(Assembler::zero, L_Tail);
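       // Main loop: store eight chunks per iteration; the remaining (size & 7)
       // chunks are written one at a time in L_TailLoop.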
2522 
2523   __ BIND(L_Loop);
2524 
2525   // Unroll 8 stores
2526   for (int i = 0; i < 8; i++) {
2527     switch (type) {
2528       case USM_SHORT:
2529         __ movw(Address(dest, (2 * i)), wide_value);
2530         break;
2531       case USM_DWORD:
2532         __ movl(Address(dest, (4 * i)), wide_value);
2533         break;
2534       case USM_QUADWORD:
2535         __ movq(Address(dest, (8 * i)), wide_value);
2536         break;
2537     }
2538   }
  __ addq(dest, incr); // advance past the 8 chunks just stored
2540   __ decrementq(tmp);
2541   __ jccb(Assembler::notZero, L_Loop);
2542 
2543   __ BIND(L_Tail);
2544 
2545   // Find number of remaining X-byte chunks
2546   __ andq(size, 0x7);
2547 
2548   // If zero, then we're done
2549   __ jccb(Assembler::zero, L_exit);
2550 
2551   __ BIND(L_TailLoop);
2552 
  switch (type) {
    case USM_SHORT:
      __ movw(Address(dest, 0), wide_value);
      break;
    case USM_DWORD:
      __ movl(Address(dest, 0), wide_value);
      break;
    case USM_QUADWORD:
      __ movq(Address(dest, 0), wide_value);
      break;
  }
  __ addq(dest, incr >> 3); // advance by one chunk (incr/8 == chunk size in bytes)
2565   __ decrementq(size);
2566   __ jccb(Assembler::notZero, L_TailLoop);
2567 }
2568 
2569 //  Generate 'unsafe' set memory stub
2570 //  Though just as safe as the other stubs, it takes an unscaled
2571 //  size_t (# bytes) argument instead of an element count.
2572 //
2573 //  Input:
2574 //    c_rarg0   - destination array address
2575 //    c_rarg1   - byte count (size_t)
2576 //    c_rarg2   - byte value
2577 //
2578 // Examines the alignment of the operands and dispatches
// to a quadword, dword, word, or byte fill loop.
2580 //
2581 address StubGenerator::generate_unsafe_setmemory(const char *name,
2582                                                  address unsafe_byte_fill) {
2583   __ align(CodeEntryAlignment);
2584   StubCodeMark mark(this, "StubRoutines", name);
2585   address start = __ pc();
2586   __ enter();   // required for proper stackwalking of RuntimeStub frame
2587 
2588   assert(unsafe_byte_fill != nullptr, "Invalid call");
2589 
2590   // bump this on entry, not on exit:
2591   INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);
2592 
2593   {
2594     Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;
2595 
2596     const Register dest = c_rarg0;
2597     const Register size = c_rarg1;
2598     const Register byteVal = c_rarg2;
2599     const Register wide_value = rax;
2600     const Register rScratch1 = r10;
2601 
2602     assert_different_registers(dest, size, byteVal, wide_value, rScratch1);
2603 
2604     //     fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2605 
2606     __ testq(size, size);
2607     __ jcc(Assembler::zero, L_exit);
2608 
    // Propagate the byte value to the full 64-bit register
2610     __ movzbl(rScratch1, byteVal);
2611     __ mov64(wide_value, 0x0101010101010101ULL);
2612     __ imulq(wide_value, rScratch1);
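    // e.g. byteVal == 0xAB gives wide_value == 0xABABABABABABABAB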
2613 
2614     // Check for pointer & size alignment
2615     __ movq(rScratch1, dest);
2616     __ orq(rScratch1, size);
2617 
2618     __ testb(rScratch1, 7);
2619     __ jcc(Assembler::equal, L_fillQuadwords);
2620 
2621     __ testb(rScratch1, 3);
2622     __ jcc(Assembler::equal, L_fillDwords);
2623 
2624     __ testb(rScratch1, 1);
2625     __ jcc(Assembler::notEqual, L_fillBytes);
2626 
2627     // Fill words
2628     {
2629       UnsafeMemoryAccessMark umam(this, true, true);
2630 
      // At this point, we know the low bit of both dest and size is
      // zero, so they are multiples of 2 (but not of 4)
2633       do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
2634                                L_exit, _masm);
2635     }
2636     __ jmpb(L_exit);
2637 
2638     __ BIND(L_fillQuadwords);
2639 
2640     // Fill QUADWORDs
2641     {
2642       UnsafeMemoryAccessMark umam(this, true, true);
2643 
      // At this point, we know the low 3 bits of both dest and size are
      // zero, so they are multiples of 8
2646       do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
2647                                L_exit, _masm);
2648     }
2649     __ BIND(L_exit);
2650 
2651     __ leave();   // required for proper stackwalking of RuntimeStub frame
2652     __ ret(0);
2653 
2654     __ BIND(L_fillDwords);
2655 
2656     // Fill DWORDs
2657     {
2658       UnsafeMemoryAccessMark umam(this, true, true);
2659 
      // At this point, we know the low 2 bits of both dest and size are
      // zero, so they are multiples of 4 (but not of 8)
2662       do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
2663                                L_exit, _masm);
2664     }
2665     __ jmpb(L_exit);
2666 
2667     __ BIND(L_fillBytes);
2668     // Set up for tail call to previously generated byte fill routine
2669     // Parameter order is (ptr, byteVal, size)
2670     __ xchgq(c_rarg1, c_rarg2);
2671     __ leave();    // Clear effect of enter()
2672     __ jump(RuntimeAddress(unsafe_byte_fill));
2673   }
2674 
2675   return start;
2676 }
2677 
2678 // Perform range checks on the proposed arraycopy.
2679 // Kills temp, but nothing else.
2680 // Also, clean the sign bits of src_pos and dst_pos.
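// The pos + length sums are compared unsigned ('above' below), so a 32-bit
// overflow of the sum also takes the failure path.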
2681 void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2682                                            Register src_pos, // source position (c_rarg1)
                                           Register dst,     // destination array oop (c_rarg2)
2684                                            Register dst_pos, // destination position (c_rarg3)
2685                                            Register length,
2686                                            Register temp,
2687                                            Label& L_failed) {
2688   BLOCK_COMMENT("arraycopy_range_checks:");
2689 
2690   //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2691   __ movl(temp, length);
2692   __ addl(temp, src_pos);             // src_pos + length
2693   __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2694   __ jcc(Assembler::above, L_failed);
2695 
2696   //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2697   __ movl(temp, length);
2698   __ addl(temp, dst_pos);             // dst_pos + length
2699   __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2700   __ jcc(Assembler::above, L_failed);
2701 
2702   // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2703   // Move with sign extension can be used since they are positive.
2704   __ movslq(src_pos, src_pos);
2705   __ movslq(dst_pos, dst_pos);
2706 
2707   BLOCK_COMMENT("arraycopy_range_checks done");
2708 }
2709 
2710 
2711 //  Generate generic array copy stubs
2712 //
2713 //  Input:
2714 //    c_rarg0    -  src oop
2715 //    c_rarg1    -  src_pos (32-bits)
2716 //    c_rarg2    -  dst oop
2717 //    c_rarg3    -  dst_pos (32-bits)
2718 // not Win64
2719 //    c_rarg4    -  element count (32-bits)
2720 // Win64
2721 //    rsp+40     -  element count (32-bits)
2722 //
2723 //  Output:
2724 //    rax ==  0  -  success
2725 //    rax == -1^K - failure, where K is partial transfer count
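//                  (-1^K == ~K == -1 - K, so a failure before any element
//                  is copied returns -1)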
2726 //
2727 address StubGenerator::generate_generic_copy(const char *name,
2728                                              address byte_copy_entry, address short_copy_entry,
2729                                              address int_copy_entry, address oop_copy_entry,
2730                                              address long_copy_entry, address checkcast_copy_entry) {
2731 
2732   Label L_failed, L_failed_0, L_objArray;
2733   Label L_copy_shorts, L_copy_ints, L_copy_longs;
2734 
2735   // Input registers
2736   const Register src        = c_rarg0;  // source array oop
2737   const Register src_pos    = c_rarg1;  // source position
2738   const Register dst        = c_rarg2;  // destination array oop
2739   const Register dst_pos    = c_rarg3;  // destination position
2740 #ifndef _WIN64
2741   const Register length     = c_rarg4;
2742   const Register rklass_tmp = r9;  // load_klass
2743 #else
2744   const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
2745   const Register rklass_tmp = rdi;  // load_klass
2746 #endif
2747 
2748   { int modulus = CodeEntryAlignment;
2749     int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2750     int advance = target - (__ offset() % modulus);
2751     if (advance < 0)  advance += modulus;
2752     if (advance > 0)  __ nop(advance);
2753   }
2754   StubCodeMark mark(this, "StubRoutines", name);
2755 
2756   // Short-hop target to L_failed.  Makes for denser prologue code.
2757   __ BIND(L_failed_0);
2758   __ jmp(L_failed);
2759   assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2760 
2761   __ align(CodeEntryAlignment);
2762   address start = __ pc();
2763 
2764   __ enter(); // required for proper stackwalking of RuntimeStub frame
2765 
2766 #ifdef _WIN64
2767   __ push(rklass_tmp); // rdi is callee-save on Windows
2768 #endif
2769 
2770   // bump this on entry, not on exit:
2771   INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);
2772 
2773   //-----------------------------------------------------------------------
2774   // Assembler stub will be used for this call to arraycopy
2775   // if the following conditions are met:
2776   //
2777   // (1) src and dst must not be null.
2778   // (2) src_pos must not be negative.
2779   // (3) dst_pos must not be negative.
2780   // (4) length  must not be negative.
2781   // (5) src klass and dst klass should be the same and not null.
2782   // (6) src and dst should be arrays.
2783   // (7) src_pos + length must not exceed length of src.
2784   // (8) dst_pos + length must not exceed length of dst.
2785   //
2786 
2787   //  if (src == nullptr) return -1;
2788   __ testptr(src, src);         // src oop
2789   size_t j1off = __ offset();
2790   __ jccb(Assembler::zero, L_failed_0);
2791 
2792   //  if (src_pos < 0) return -1;
2793   __ testl(src_pos, src_pos); // src_pos (32-bits)
2794   __ jccb(Assembler::negative, L_failed_0);
2795 
2796   //  if (dst == nullptr) return -1;
2797   __ testptr(dst, dst);         // dst oop
2798   __ jccb(Assembler::zero, L_failed_0);
2799 
2800   //  if (dst_pos < 0) return -1;
2801   __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2802   size_t j4off = __ offset();
2803   __ jccb(Assembler::negative, L_failed_0);
2804 
2805   // The first four tests are very dense code,
2806   // but not quite dense enough to put four
2807   // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predictors
2809   // do not like jumps so close together.
2810   // Make sure of this.
2811   guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2812 
2813   // registers used as temp
2814   const Register r11_length    = r11; // elements count to copy
2815   const Register r10_src_klass = r10; // array klass
2816 
2817   //  if (length < 0) return -1;
  __ movl(r11_length, length);        // length (elements count, 32-bit value)
2819   __ testl(r11_length, r11_length);
2820   __ jccb(Assembler::negative, L_failed_0);
2821 
2822   __ load_klass(r10_src_klass, src, rklass_tmp);
2823 #ifdef ASSERT
2824   //  assert(src->klass() != nullptr);
2825   {
2826     BLOCK_COMMENT("assert klasses not null {");
2827     Label L1, L2;
2828     __ testptr(r10_src_klass, r10_src_klass);
2829     __ jcc(Assembler::notZero, L2);   // it is broken if klass is null
2830     __ bind(L1);
2831     __ stop("broken null klass");
2832     __ bind(L2);
2833     __ load_klass(rax, dst, rklass_tmp);
2834     __ cmpq(rax, 0);
2835     __ jcc(Assembler::equal, L1);     // this would be broken also
2836     BLOCK_COMMENT("} assert klasses not null done");
2837   }
2838 #endif
2839 
2840   // Load layout helper (32-bits)
2841   //
2842   //  |array_tag|     | header_size | element_type |     |log2_element_size|
2843   // 32        30    24            16              8     2                 0
2844   //
2845   //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2846   //
2847 
2848   const int lh_offset = in_bytes(Klass::layout_helper_offset());
2849 
2850   // Handle objArrays completely differently...
2851   const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2852   __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2853   __ jcc(Assembler::equal, L_objArray);
2854 
2855   //  if (src->klass() != dst->klass()) return -1;
2856   __ load_klass(rax, dst, rklass_tmp);
2857   __ cmpq(r10_src_klass, rax);
2858   __ jcc(Assembler::notEqual, L_failed);
2859 
2860   const Register rax_lh = rax;  // layout helper
2861   __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2862 
2863   //  if (!src->is_Array()) return -1;
2864   __ cmpl(rax_lh, Klass::_lh_neutral_value);
2865   __ jcc(Assembler::greaterEqual, L_failed);
2866 
2867   // At this point, it is known to be a typeArray (array_tag 0x3).
2868 #ifdef ASSERT
2869   {
2870     BLOCK_COMMENT("assert primitive array {");
2871     Label L;
2872     __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2873     __ jcc(Assembler::greaterEqual, L);
2874     __ stop("must be a primitive array");
2875     __ bind(L);
2876     BLOCK_COMMENT("} assert primitive array done");
2877   }
2878 #endif
2879 
2880   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2881                          r10, L_failed);
2882 
2883   // TypeArrayKlass
2884   //
2885   // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2886   // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2887   //
2888 
2889   const Register r10_offset = r10;    // array offset
2890   const Register rax_elsize = rax_lh; // element size
2891 
2892   __ movl(r10_offset, rax_lh);
2893   __ shrl(r10_offset, Klass::_lh_header_size_shift);
2894   __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2895   __ addptr(src, r10_offset);           // src array offset
2896   __ addptr(dst, r10_offset);           // dst array offset
2897   BLOCK_COMMENT("choose copy loop based on element size");
2898   __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2899 
2900 #ifdef _WIN64
2901   __ pop(rklass_tmp); // Restore callee-save rdi
2902 #endif
2903 
  // The following registers must be set before the jump to the corresponding stub
2905   const Register from     = c_rarg0;  // source array address
2906   const Register to       = c_rarg1;  // destination array address
2907   const Register count    = c_rarg2;  // elements count
2908 
  // 'from', 'to' and 'count' must be set in this order, since they occupy
  // the same registers as 'src', 'src_pos' and 'dst' respectively.
2911 
2912   __ cmpl(rax_elsize, 0);
2913   __ jccb(Assembler::notEqual, L_copy_shorts);
2914   __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2915   __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2916   __ movl2ptr(count, r11_length); // length
2917   __ jump(RuntimeAddress(byte_copy_entry));
2918 
2919 __ BIND(L_copy_shorts);
2920   __ cmpl(rax_elsize, LogBytesPerShort);
2921   __ jccb(Assembler::notEqual, L_copy_ints);
2922   __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2923   __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2924   __ movl2ptr(count, r11_length); // length
2925   __ jump(RuntimeAddress(short_copy_entry));
2926 
2927 __ BIND(L_copy_ints);
2928   __ cmpl(rax_elsize, LogBytesPerInt);
2929   __ jccb(Assembler::notEqual, L_copy_longs);
2930   __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2931   __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2932   __ movl2ptr(count, r11_length); // length
2933   __ jump(RuntimeAddress(int_copy_entry));
2934 
2935 __ BIND(L_copy_longs);
2936 #ifdef ASSERT
2937   {
2938     BLOCK_COMMENT("assert long copy {");
2939     Label L;
2940     __ cmpl(rax_elsize, LogBytesPerLong);
2941     __ jcc(Assembler::equal, L);
2942     __ stop("must be long copy, but elsize is wrong");
2943     __ bind(L);
2944     BLOCK_COMMENT("} assert long copy done");
2945   }
2946 #endif
2947   __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2948   __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2949   __ movl2ptr(count, r11_length); // length
2950   __ jump(RuntimeAddress(long_copy_entry));
2951 
2952   // ObjArrayKlass
2953 __ BIND(L_objArray);
2954   // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
2955 
2956   Label L_plain_copy, L_checkcast_copy;
2957   //  test array classes for subtyping
2958   __ load_klass(rax, dst, rklass_tmp);
2959   __ cmpq(r10_src_klass, rax); // usual case is exact equality
2960   __ jcc(Assembler::notEqual, L_checkcast_copy);
2961 
2962   // Identically typed arrays can be copied without element-wise checks.
2963   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2964                          r10, L_failed);
2965 
2966   __ lea(from, Address(src, src_pos, TIMES_OOP,
2967                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2968   __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2969                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2970   __ movl2ptr(count, r11_length); // length
2971 __ BIND(L_plain_copy);
2972 #ifdef _WIN64
2973   __ pop(rklass_tmp); // Restore callee-save rdi
2974 #endif
2975   __ jump(RuntimeAddress(oop_copy_entry));
2976 
2977 __ BIND(L_checkcast_copy);
2978   // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
2979   {
2980     // Before looking at dst.length, make sure dst is also an objArray.
2981     __ cmpl(Address(rax, lh_offset), objArray_lh);
2982     __ jcc(Assembler::notEqual, L_failed);
2983 
2984     // It is safe to examine both src.length and dst.length.
2985     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2986                            rax, L_failed);
2987 
2988     const Register r11_dst_klass = r11;
2989     __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
2990 
2991     // Marshal the base address arguments now, freeing registers.
2992     __ lea(from, Address(src, src_pos, TIMES_OOP,
2993                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2994     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2995                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2996     __ movl(count, length);           // length (reloaded)
2997     Register sco_temp = c_rarg3;      // this register is free now
2998     assert_different_registers(from, to, count, sco_temp,
2999                                r11_dst_klass, r10_src_klass);
3000     assert_clean_int(count, sco_temp);
3001 
3002     // Generate the type check.
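    // generate_type_check jumps to L_plain_copy when the source array klass
    // is a subtype of the destination array klass, in which case no
    // per-element store check is needed.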
3003     const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3004     __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3005     assert_clean_int(sco_temp, rax);
3006     generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3007 
3008     // Fetch destination element klass from the ObjArrayKlass header.
3009     int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3010     __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3011     __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3012     assert_clean_int(sco_temp, rax);
3013 
3014 #ifdef _WIN64
3015     __ pop(rklass_tmp); // Restore callee-save rdi
3016 #endif
3017 
3018     // the checkcast_copy loop needs two extra arguments:
3019     assert(c_rarg3 == sco_temp, "#3 already in place");
3020     // Set up arguments for checkcast_copy_entry.
3021     setup_arg_regs_using_thread(4);
3022     __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3023     __ jump(RuntimeAddress(checkcast_copy_entry));
3024   }
3025 
3026 __ BIND(L_failed);
3027 #ifdef _WIN64
3028   __ pop(rklass_tmp); // Restore callee-save rdi
3029 #endif
3030   __ xorptr(rax, rax);
3031   __ notptr(rax); // return -1
3032   __ leave();   // required for proper stackwalking of RuntimeStub frame
3033   __ ret(0);
3034 
3035   return start;
3036 }
3037 
3038 #undef __