1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/macroAssembler.hpp"
  26 #include "gc/shared/barrierSet.hpp"
  27 #include "gc/shared/barrierSetAssembler.hpp"
  28 #include "oops/objArrayKlass.hpp"
  29 #include "runtime/sharedRuntime.hpp"
  30 #include "runtime/stubRoutines.hpp"
  31 #include "stubGenerator_x86_64.hpp"
  32 #ifdef COMPILER2
  33 #include "opto/c2_globals.hpp"
  34 #endif
  35 #if INCLUDE_JVMCI
  36 #include "jvmci/jvmci_globals.hpp"
  37 #endif
  38 
  39 #define __ _masm->
  40 
  41 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #else
  46 #define BLOCK_COMMENT(str) __ block_comment(str)
  47 #endif // PRODUCT
  48 
  49 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  50 
  51 #ifdef PRODUCT
  52 #define INC_COUNTER_NP(counter, rscratch) ((void)0)
  53 #else
  54 #define INC_COUNTER_NP(counter, rscratch) \
  55 BLOCK_COMMENT("inc_counter " #counter); \
  56 inc_counter_np(_masm, counter, rscratch);
  57 
// Emit code to bump a 32-bit in-memory counter (non-product builds only).
// rscratch may be used by incrementl as a scratch register when the
// counter's address is not directly reachable.
static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  __ incrementl(ExternalAddress((address)&counter), rscratch);
}
  61 
  62 #if COMPILER2_OR_JVMCI
  63 static uint& get_profile_ctr(int shift) {
  64   if (shift == 0) {
  65     return SharedRuntime::_jbyte_array_copy_ctr;
  66   } else if (shift == 1) {
  67     return SharedRuntime::_jshort_array_copy_ctr;
  68   } else if (shift == 2) {
  69     return SharedRuntime::_jint_array_copy_ctr;
  70   } else {
  71     assert(shift == 3, "");
  72     return SharedRuntime::_jlong_array_copy_ctr;
  73   }
  74 }
  75 #endif // COMPILER2_OR_JVMCI
  76 #endif // !PRODUCT
  77 
void StubGenerator::generate_arraycopy_stubs() {
  // Some copy stubs publish a normal entry and then a 2nd 'fallback'
  // entry immediately following their stack push. This can be used
  // as a post-push branch target for compatible stubs when they
  // identify a special case that can be handled by the fallback
  // stub, e.g. a disjoint copy stub may be used as a special-case
  // fallback for its compatible conjoint copy stub.
  //
  // A no push entry is always returned in the following local and
  // then published by assigning to the appropriate entry field in
  // class StubRoutines. The entry value is then passed to the
  // generator for the compatible stub. That means the entry must be
  // listed when saving to/restoring from the AOT cache, ensuring
  // that the inter-stub jumps are noted at AOT-cache save and
  // relocated at AOT cache load.
  //
  // NOTE: the order below matters — each conjoint generator consumes the
  // nopush entry published by the disjoint generator just before it, and
  // the unsafe/generic generators consume the conjoint nopush entries.
  address nopush_entry;

  // jbyte copies
  StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(&nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
  StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jbyte_arraycopy_nopush    = nopush_entry;

  // jshort copies
  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jshort_arraycopy_nopush   = nopush_entry;

  // jint copies (the int generators double as oop copies when oops are compressed)
  StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jint_arraycopy_nopush     = nopush_entry;

  // jlong copies (the long generators double as oop copies when oops are uncompressed)
  StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jlong_disjoint_arraycopy_nopush  = nopush_entry;
  StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jlong_arraycopy_nopush    = nopush_entry;

  // oop copies: element width (and hence the generator used) depends on
  // whether oops are compressed (32-bit) or uncompressed (64-bit).
  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush    = nopush_entry;
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush  = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  } else {
    StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush    = nopush_entry;
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush  = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  }

  // checkcast copies (element-wise store-check variants)
  StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
  // checkcast nopush entry is needed by generic copy
  StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
  // note that we don't need a returned nopush entry because the
  // generic copy does not cater for uninit arrays.
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);

  // unsafe/generic dispatchers jump to the nopush entries published above
  StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                            StubRoutines::_jshort_arraycopy_nopush,
                                                            StubRoutines::_jint_arraycopy_nopush,
                                                            StubRoutines::_jlong_arraycopy_nopush);
  StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                             StubRoutines::_jshort_arraycopy_nopush,
                                                             StubRoutines::_jint_arraycopy_nopush,
                                                             StubRoutines::_oop_arraycopy_nopush,
                                                             StubRoutines::_jlong_arraycopy_nopush,
                                                             StubRoutines::_checkcast_arraycopy_nopush);

  // array fill stubs
  StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
  StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
  StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
  StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
  StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
  StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);

  StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);

  // We don't generate specialized code for HeapWord-aligned source
  // arrays, so just use the code we've already generated
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

  StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

  StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
  StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

  StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
  StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
}
 198 
 199 
 200 // Verify that a register contains clean 32-bits positive value
 201 // (high 32-bits are 0) so it could be used in 64-bits shifts.
 202 //
 203 //  Input:
 204 //    Rint  -  32-bits value
 205 //    Rtmp  -  scratch
 206 //
 207 void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
 208 #ifdef ASSERT
 209   Label L;
 210   assert_different_registers(Rtmp, Rint);
 211   __ movslq(Rtmp, Rint);
 212   __ cmpq(Rtmp, Rint);
 213   __ jcc(Assembler::equal, L);
 214   __ stop("high 32-bits of int value are not 0");
 215   __ bind(L);
 216 #endif
 217 }
 218 
 219 
 220 //  Generate overlap test for array copy stubs
 221 //
 222 //  Input:
 223 //     c_rarg0 - from
 224 //     c_rarg1 - to
 225 //     c_rarg2 - element count
 226 //
 227 //  Output:
 228 //     rax   - &from[element count - 1]
 229 //
 230 void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
 231   const Register from     = c_rarg0;
 232   const Register to       = c_rarg1;
 233   const Register count    = c_rarg2;
 234   const Register end_from = rax;
 235 
 236   __ cmpptr(to, from);
 237   __ lea(end_from, Address(from, count, sf, 0));
 238   if (NOLp == nullptr) {
 239     RuntimeAddress no_overlap(no_overlap_target);
 240     __ jump_cc(Assembler::belowEqual, no_overlap);
 241     __ cmpptr(to, end_from);
 242     __ jump_cc(Assembler::aboveEqual, no_overlap);
 243   } else {
 244     __ jcc(Assembler::belowEqual, (*NOLp));
 245     __ cmpptr(to, end_from);
 246     __ jcc(Assembler::aboveEqual, (*NOLp));
 247   }
 248 }
 249 
 250 
 251 // Copy big chunks forward
 252 //
 253 // Inputs:
//   end_from     - source array end address
 255 //   end_to       - destination array end address
 256 //   qword_count  - 64-bits element count, negative
 257 //   tmp1         - scratch
 258 //   L_copy_bytes - entry label
 259 //   L_copy_8_bytes  - exit  label
 260 //
void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
                                       Register qword_count, Register tmp1,
                                       Register tmp2, Label& L_copy_bytes,
                                       Label& L_copy_8_bytes, DecoratorSet decorators,
                                       BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // Callers must branch to L_copy_bytes (the loop-control entry below);
  // falling straight into the loop body would copy before the count is
  // adjusted, hence the debug-only trap here.
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    // Main loop: 64 bytes (8 qwords) per iteration, as two 32-byte vector
    // moves (AVX2+) or four 16-byte vector moves.
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);

      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -40),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -40), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }

    // Loop control: qword_count is negative and advances by 8 qwords per
    // iteration; stay in the loop while it has not passed zero.
    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 8);
    __ jcc(Assembler::lessEqual, L_loop);
    __ subptr(qword_count, 4);  // sub(8) and add(4)
    __ jcc(Assembler::greater, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }
    // Account for the 4 qwords just copied.
    __ addptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32-bytes per iteration using four 8-byte moves.
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    // Loop control: advance by 4 qwords per iteration while count <= 0.
    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::lessEqual, L_loop);
  }
  // Fewer than 4 qwords remain; branch out to the caller's qword-at-a-time
  // tail copy if there are any trailing qwords left.
  __ subptr(qword_count, 4);
  __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
}
 378 
 379 
 380 // Copy big chunks backward
 381 //
 382 // Inputs:
//   from         - source array address
 384 //   dest         - destination array address
 385 //   qword_count  - 64-bits element count
 386 //   tmp1         - scratch
 387 //   L_copy_bytes - entry label
 388 //   L_copy_8_bytes  - exit  label
 389 //
void StubGenerator::copy_bytes_backward(Register from, Register dest,
                                        Register qword_count, Register tmp1,
                                        Register tmp2, Label& L_copy_bytes,
                                        Label& L_copy_8_bytes, DecoratorSet decorators,
                                        BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // Callers must branch to L_copy_bytes (the loop-control entry below);
  // falling straight into the loop body would copy before the count is
  // adjusted, hence the debug-only trap here.
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    // Main loop: 64 bytes (8 qwords) per iteration, as two 32-byte vector
    // moves (AVX2+) or four 16-byte vector moves, working downward.
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 48),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 48), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }

    // Loop control: qword_count is positive and decreases by 8 qwords per
    // iteration; stay in the loop while it has not dropped below zero.
    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_loop);

    __ addptr(qword_count, 4);  // add(8) and sub(4)
    __ jcc(Assembler::less, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }
    // Account for the 4 qwords just copied.
    __ subptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32-bytes per iteration using four 8-byte moves, working downward.
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    // Loop control: decrease by 4 qwords per iteration while count >= 0.
    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::greaterEqual, L_loop);
  }
  // Fewer than 4 qwords remain; branch out to the caller's qword-at-a-time
  // tail copy if there are any trailing qwords left.
  __ addptr(qword_count, 4);
  __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
}
 507 
 508 #if COMPILER2_OR_JVMCI
 509 
 510 // Note: Following rules apply to AVX3 optimized arraycopy stubs:-
 511 // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
 512 //   for both special cases (various small block sizes) and aligned copy loop. This is the
 513 //   default configuration.
 514 // - If copy length is above CopyAVX3Threshold, then implementation use 64 byte vectors (ZMMs)
 515 //   for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
 516 // - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a
 517 //   better performance for disjoint copies. For conjoint/backward copy vector based
 518 //   copy performs better.
 519 // - If user sets CopyAVX3Threshold=0, then special cases for small blocks sizes operate over
 520 //   64 byte vector registers (ZMMs).
 521 
 522 // Inputs:
 523 //   c_rarg0   - source array address
 524 //   c_rarg1   - destination array address
 525 //   c_rarg2   - element count, treated as ssize_t, can be zero
 526 //
 527 //
 528 // Side Effects:
 529 //   disjoint_copy_avx3_masked is set to the no-overlap entry point
 530 //   used by generate_conjoint_[byte/int/short/long]_copy().
 531 //
 532 address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry) {
 533   // aligned is always false -- x86_64 always uses the unaligned code
 534   const bool aligned = false;
 535   int shift;
 536   bool is_oop;
 537   bool dest_uninitialized;
 538 
 539   switch (stub_id) {
 540   case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 541     shift = 0;
 542     is_oop = false;
 543     dest_uninitialized = false;
 544     break;
 545   case StubId::stubgen_jshort_disjoint_arraycopy_id:
 546     shift = 1;
 547     is_oop = false;
 548     dest_uninitialized = false;
 549     break;
 550   case StubId::stubgen_jint_disjoint_arraycopy_id:
 551     shift = 2;
 552     is_oop = false;
 553     dest_uninitialized = false;
 554     break;
 555   case StubId::stubgen_jlong_disjoint_arraycopy_id:
 556     shift = 3;
 557     is_oop = false;
 558     dest_uninitialized = false;
 559     break;
 560   case StubId::stubgen_oop_disjoint_arraycopy_id:
 561     shift = (UseCompressedOops ? 2 : 3);
 562     is_oop = true;
 563     dest_uninitialized = false;
 564     break;
 565   case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 566     shift = (UseCompressedOops ? 2 : 3);
 567     is_oop = true;
 568     dest_uninitialized = true;
 569     break;
 570   default:
 571     ShouldNotReachHere();
 572   }
 573 
 574   __ align(CodeEntryAlignment);
 575   StubCodeMark mark(this, stub_id);
 576   address start = __ pc();
 577 
 578   bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
 579   const int large_threshold = 2621440; // 2.5 MB
 580   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 581   Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 582   Label L_copy_large, L_finish;
 583   const Register from        = rdi;  // source array address
 584   const Register to          = rsi;  // destination array address
 585   const Register count       = rdx;  // elements count
 586   const Register temp1       = r8;
 587   const Register temp2       = r11;
 588   const Register temp3       = rax;
 589   const Register temp4       = rcx;
 590   // End pointers are inclusive, and if count is not zero they point
 591   // to the last unit copied:  end_to[0] := end_from[0]
 592 
 593   __ enter(); // required for proper stackwalking of RuntimeStub frame
 594   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 595 
 596   if (entry != nullptr) {
 597     *entry = __ pc();
 598      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 599     BLOCK_COMMENT("Entry:");
 600   }
 601 
 602   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 603   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 604 
 605   setup_argument_regs(type);
 606 
 607   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 608   if (dest_uninitialized) {
 609     decorators |= IS_DEST_UNINITIALIZED;
 610   }
 611   if (aligned) {
 612     decorators |= ARRAYCOPY_ALIGNED;
 613   }
 614   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 615   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 616 
 617   {
 618     // Type(shift)           byte(0), short(1), int(2),   long(3)
 619     int loop_size[]        = { 192,     96,       48,      24};
 620     int threshold[]        = { 4096,    2048,     1024,    512};
 621 
 622     // UnsafeMemoryAccess page error: continue after unsafe access
 623     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
 624     // 'from', 'to' and 'count' are now valid
 625 
 626     // temp1 holds remaining count and temp4 holds running count used to compute
 627     // next address offset for start of to/from addresses (temp4 * scale).
 628     __ mov64(temp4, 0);
 629     __ movq(temp1, count);
 630 
 631     // Zero length check.
 632     __ BIND(L_tail);
 633     __ cmpq(temp1, 0);
 634     __ jcc(Assembler::lessEqual, L_exit);
 635 
 636     // Special cases using 32 byte [masked] vector copy operations.
 637     arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 638                                  temp4, temp3, use64byteVector, L_entry, L_exit);
 639 
 640     // PRE-MAIN-POST loop for aligned copy.
 641     __ BIND(L_entry);
 642 
 643     if (MaxVectorSize == 64) {
 644       __ movq(temp2, temp1);
 645       __ shlq(temp2, shift);
 646       __ cmpq(temp2, large_threshold);
 647       __ jcc(Assembler::greaterEqual, L_copy_large);
 648     }
 649     if (CopyAVX3Threshold != 0) {
 650       __ cmpq(count, threshold[shift]);
 651       if (MaxVectorSize == 64) {
 652         // Copy using 64 byte vectors.
 653         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 654       } else {
 655         assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
 656         // REP MOVS offer a faster copy path.
 657         __ jcc(Assembler::greaterEqual, L_repmovs);
 658       }
 659     }
 660 
 661     if ((MaxVectorSize < 64)  || (CopyAVX3Threshold != 0)) {
 662       // Partial copy to make dst address 32 byte aligned.
 663       __ movq(temp2, to);
 664       __ andq(temp2, 31);
 665       __ jcc(Assembler::equal, L_main_pre_loop);
 666 
 667       __ negptr(temp2);
 668       __ addq(temp2, 32);
 669       if (shift) {
 670         __ shrq(temp2, shift);
 671       }
 672       __ movq(temp3, temp2);
 673       copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
 674       __ movq(temp4, temp2);
 675       __ movq(temp1, count);
 676       __ subq(temp1, temp2);
 677 
 678       __ cmpq(temp1, loop_size[shift]);
 679       __ jcc(Assembler::less, L_tail);
 680 
 681       __ BIND(L_main_pre_loop);
 682       __ subq(temp1, loop_size[shift]);
 683 
 684       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 685       __ align32();
 686       __ BIND(L_main_loop);
 687          copy64_avx(to, from, temp4, xmm1, false, shift, 0);
 688          copy64_avx(to, from, temp4, xmm1, false, shift, 64);
 689          copy64_avx(to, from, temp4, xmm1, false, shift, 128);
 690          __ addptr(temp4, loop_size[shift]);
 691          __ subq(temp1, loop_size[shift]);
 692          __ jcc(Assembler::greater, L_main_loop);
 693 
 694       __ addq(temp1, loop_size[shift]);
 695 
 696       // Tail loop.
 697       __ jmp(L_tail);
 698 
 699       __ BIND(L_repmovs);
 700         __ movq(temp2, temp1);
 701         // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics.
 702         __ movq(temp3, to);
 703         __ movq(to,  from);
 704         __ movq(from, temp3);
 705         // Save to/from for restoration post rep_mov.
 706         __ movq(temp1, to);
 707         __ movq(temp3, from);
 708         if(shift < 3) {
 709           __ shrq(temp2, 3-shift);     // quad word count
 710         }
 711         __ movq(temp4 , temp2);        // move quad ward count into temp4(RCX).
 712         __ rep_mov();
 713         __ shlq(temp2, 3);             // convert quad words into byte count.
 714         if(shift) {
 715           __ shrq(temp2, shift);       // type specific count.
 716         }
 717         // Restore original addresses in to/from.
 718         __ movq(to, temp3);
 719         __ movq(from, temp1);
 720         __ movq(temp4, temp2);
 721         __ movq(temp1, count);
 722         __ subq(temp1, temp2);         // tailing part (less than a quad ward size).
 723         __ jmp(L_tail);
 724     }
 725 
 726     if (MaxVectorSize > 32) {
 727       __ BIND(L_pre_main_post_64);
 728       // Partial copy to make dst address 64 byte aligned.
 729       __ movq(temp2, to);
 730       __ andq(temp2, 63);
 731       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 732 
 733       __ negptr(temp2);
 734       __ addq(temp2, 64);
 735       if (shift) {
 736         __ shrq(temp2, shift);
 737       }
 738       __ movq(temp3, temp2);
 739       copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
 740       __ movq(temp4, temp2);
 741       __ movq(temp1, count);
 742       __ subq(temp1, temp2);
 743 
 744       __ cmpq(temp1, loop_size[shift]);
 745       __ jcc(Assembler::less, L_tail64);
 746 
 747       __ BIND(L_main_pre_loop_64bytes);
 748       __ subq(temp1, loop_size[shift]);
 749 
 750       // Main loop with aligned copy block size of 192 bytes at
 751       // 64 byte copy granularity.
 752       __ align32();
 753       __ BIND(L_main_loop_64bytes);
 754          copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
 755          copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
 756          copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
 757          __ addptr(temp4, loop_size[shift]);
 758          __ subq(temp1, loop_size[shift]);
 759          __ jcc(Assembler::greater, L_main_loop_64bytes);
 760 
 761       __ addq(temp1, loop_size[shift]);
 762       // Zero length check.
 763       __ jcc(Assembler::lessEqual, L_exit);
 764 
 765       __ BIND(L_tail64);
 766 
 767       // Tail handling using 64 byte [masked] vector copy operations.
 768       use64byteVector = true;
 769       arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 770                                    temp4, temp3, use64byteVector, L_entry, L_exit);
 771     }
 772     __ BIND(L_exit);
 773   }
 774 
 775   __ BIND(L_finish);
 776   address ucme_exit_pc = __ pc();
 777   // When called from generic_arraycopy r11 contains specific values
 778   // used during arraycopy epilogue, re-initializing r11.
 779   if (is_oop) {
 780     __ movq(r11, shift == 3 ? count : to);
 781   }
 782   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
 783   restore_argument_regs(type);
 784   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
 785   __ xorptr(rax, rax); // return 0
 786   __ vzeroupper();
 787   __ leave(); // required for proper stackwalking of RuntimeStub frame
 788   __ ret(0);
 789 
 790   if (MaxVectorSize == 64) {
 791     __ BIND(L_copy_large);
 792       UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
 793       arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
 794     __ jmp(L_finish);
 795   }
 796   return start;
 797 }
 798 
 799 void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
 800                                          Register temp3, Register temp4, Register count,
 801                                          XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
 802                                          XMMRegister xmm4, int shift) {
 803 
 804   // Type(shift)           byte(0), short(1), int(2),   long(3)
 805   int loop_size[]        = { 256,     128,       64,      32};
 806   int threshold[]        = { 4096,    2048,     1024,    512};
 807 
 808   Label L_main_loop_large;
 809   Label L_tail_large;
 810   Label L_exit_large;
 811   Label L_entry_large;
 812   Label L_main_pre_loop_large;
 813   Label L_pre_main_post_large;
 814 
 815   assert(MaxVectorSize == 64, "vector length != 64");
 816   __ BIND(L_entry_large);
 817 
 818   __ BIND(L_pre_main_post_large);
 819   // Partial copy to make dst address 64 byte aligned.
 820   __ movq(temp2, to);
 821   __ andq(temp2, 63);
 822   __ jcc(Assembler::equal, L_main_pre_loop_large);
 823 
 824   __ negptr(temp2);
 825   __ addq(temp2, 64);
 826   if (shift) {
 827     __ shrq(temp2, shift);
 828   }
 829   __ movq(temp3, temp2);
 830   copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
 831   __ movq(temp4, temp2);
 832   __ movq(temp1, count);
 833   __ subq(temp1, temp2);
 834 
 835   __ cmpq(temp1, loop_size[shift]);
 836   __ jcc(Assembler::less, L_tail_large);
 837 
 838   __ BIND(L_main_pre_loop_large);
 839   __ subq(temp1, loop_size[shift]);
 840 
 841   // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
 842   __ align32();
 843   __ BIND(L_main_loop_large);
 844   copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
 845   __ addptr(temp4, loop_size[shift]);
 846   __ subq(temp1, loop_size[shift]);
 847   __ jcc(Assembler::greater, L_main_loop_large);
 848   // fence needed because copy256_avx3 uses non-temporal stores
 849   __ sfence();
 850 
 851   __ addq(temp1, loop_size[shift]);
 852   // Zero length check.
 853   __ jcc(Assembler::lessEqual, L_exit_large);
 854   __ BIND(L_tail_large);
 855   // Tail handling using 64 byte [masked] vector copy operations.
 856   __ cmpq(temp1, 0);
 857   __ jcc(Assembler::lessEqual, L_exit_large);
 858   arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
 859                                temp4, temp3, L_exit_large);
 860   __ BIND(L_exit_large);
 861 }
 862 
 863 // Inputs:
 864 //   c_rarg0   - source array address
 865 //   c_rarg1   - destination array address
 866 //   c_rarg2   - element count, treated as ssize_t, can be zero
 867 //
 868 //
// Generates a conjoint (overlap-safe, memmove-style) arraycopy stub using
// AVX / AVX-512 masked vector operations.  If the ranges do not overlap,
// array_overlap_test branches to nooverlap_target (the disjoint stub);
// otherwise the copy proceeds from the highest addresses downwards so an
// overlapping destination above the source is handled correctly.
// Returns the stub's start address; *entry (if non-null) receives the
// post-frame-setup entry point.
address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, address* entry, address nooverlap_target) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  int shift;
  bool is_oop;
  bool dest_uninitialized;

  // Derive element size (shift = log2 bytes), oop-ness and whether the
  // destination is known uninitialized from the stub id.
  switch (stub_id) {
  case StubId::stubgen_jbyte_arraycopy_id:
    shift = 0;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jshort_arraycopy_id:
    shift = 1;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jint_arraycopy_id:
    shift = 2;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jlong_arraycopy_id:
    shift = 3;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_uninit_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  // 64-byte vectors are used throughout only when CopyAVX3Threshold == 0;
  // otherwise they are reserved for counts above the threshold (see below).
  bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);

  Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register temp1       = r8;
  const Register temp2       = rcx;
  const Register temp3       = r11;
  const Register temp4       = rax;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // Branch to nooverlap_target (the disjoint stub) when src/dst don't overlap.
  array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));

  BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
  {
    // Elements per 192-byte main-loop iteration and AVX3 thresholds (elements),
    // both indexed by shift:
    // Type(shift)       byte(0), short(1), int(2),   long(3)
    int loop_size[]   = { 192,     96,       48,      24};
    int threshold[]   = { 4096,    2048,     1024,    512};

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count.
    __ movq(temp1, count);

    // Zero length check.  L_tail is re-entered after each partial copy
    // with temp1 = elements still to be copied.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // temp2 = start index (0), temp3 = end index (remaining count).
    __ mov64(temp2, 0);
    __ movq(temp3, temp1);
    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                          temp4, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if ((MaxVectorSize > 32) && (CopyAVX3Threshold != 0)) {
      // Large counts take the 64-byte-vector path below.
      __ cmpq(temp1, threshold[shift]);
      __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
    }

    if ((MaxVectorSize < 64)  || (CopyAVX3Threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      // Conjoint copies run backwards, so it is the END address
      // (to + count << shift) that gets aligned here.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      // temp1 is the element index just past the unprocessed region, so the
      // three copies use negative byte offsets and temp1 counts down.
      __ align32();
      __ BIND(L_main_loop);
         copy64_avx(to, from, temp1, xmm1, true, shift, -64);
         copy64_avx(to, from, temp1, xmm1, true, shift, -128);
         copy64_avx(to, from, temp1, xmm1, true, shift, -192);
         __ subptr(temp1, loop_size[shift]);
         __ cmpq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop);

      // Tail loop.
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
         copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
         copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
         copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
         __ subq(temp1, loop_size[shift]);
         __ cmpq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop_64bytes);

      // Zero length check.
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      __ mov64(temp2, 0);
      __ movq(temp3, temp1);
      arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                            temp4, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
  if(is_oop) {
    __ movq(r11, count);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
1076 
// Emits the short-length dispatch for the disjoint (forward) copy: branches
// on the remaining element count and copies up to 192 bytes with 32-byte
// vector moves plus one masked move for the ragged tail.  size_mat rows
// give the element counts equivalent to 32..192 bytes for each shift.
// Counts above 192 bytes fall through to L_entry (main-loop entry); every
// completed case jumps to L_exit.  'count' holds the remaining element
// count, 'index' the running element index, 'temp' is scratch for mask
// construction.
void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                 Register to, Register count, int shift,
                                                 Register index, Register temp,
                                                 bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  int size_mat[][6] = {
  /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
  /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
  /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
  /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
  };

  // Case A) Special case for length less than equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  __ jmp(L_exit);

  // Case C) Special case for length less than equal to 96 bytes.
  // Copy a full 64 bytes, then mask-copy the remaining <= 32 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  __ jmp(L_exit);

  // Case D) Special case for length less than equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  __ jmp(L_exit);

  // Case E) Special case for length less than equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  __ jmp(L_exit);

  // Case F) Special case for length less than equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  __ jmp(L_exit);
}
1144 
// Emits the sub-256-byte tail dispatch for the large-copy path
// (arraycopy_avx3_large): branches on the remaining element count and
// copies up to 256 bytes using full and masked 64-byte (ZMM) moves.
// size_mat rows give the element counts equivalent to 64..256 bytes per
// shift.  Case D has no count guard; the caller guarantees the remaining
// count fits (it only reaches here with fewer than one 256-byte loop
// iteration left).  Every case jumps to L_exit.
void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                                     Register to, Register count, int shift, Register index,
                                                     Register temp, Label& L_exit) {
  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;

  int size_mat[][4] = {
  /* T_BYTE */ {64, 128, 192, 256},
  /* T_SHORT*/ {32, 64 , 96 , 128},
  /* T_INT  */ {16, 32 , 48 ,  64},
  /* T_LONG */ { 8, 16 , 24 ,  32}
  };

  assert(MaxVectorSize == 64, "vector length != 64");
  // Case A) Special case for length less than or equal to 64 bytes.
  // NOTE(review): L_entry_64 is bound here but never used as a jump target.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  __ subq(count, 64 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][2]);
  __ jcc(Assembler::greater, L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  __ subq(count, 128 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 256 bytes.
  __ BIND(L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  copy64_avx(to, from, index, xmm, false, shift, 128, true);
  __ subq(count, 192 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
  __ jmp(L_exit);
}
1193 
// Emits the short-length dispatch for the conjoint (backward, overlap-safe)
// copy: branches on the remaining element count and copies up to 192 bytes.
// Higher portions are copied first via negative offsets from end_index, and
// the final low <= 32 (or 64) bytes are mask-copied from start_index, so
// overlapping source bytes are read before being overwritten.
// start_index = first unprocessed element index (0 on entry),
// end_index = one past the last unprocessed element; both are caller-set.
// Counts above 192 bytes fall through to L_entry; completed cases jump to
// L_exit.
void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                           Register to, Register start_index, Register end_index,
                                                           Register count, int shift, Register temp,
                                                           bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;
  // True when full-width 64-byte vectors may be used for the <= 64 byte case.
  bool avx3 = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);

  int size_mat[][6] = {
  /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
  /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
  /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
  /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
  };

  // Case A) Special case for length less than equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  if (avx3) {
     // One masked 64-byte move covers the whole range.
     copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
  } else {
     // Copy the top 32 bytes first, then mask-copy the low remainder.
     copy32_avx(to, from, end_index, xmm, shift, -32);
     __ subq(count, 32 >> shift);
     copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  }
  __ jmp(L_exit);

  // Case C) Special case for length less than equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case D) Special case for length less than equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -96);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case E) Special case for length less than equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case F) Special case for length less than equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -160);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);
}
1268 
1269 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1270                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1271                                 int shift, int offset) {
1272   if (MaxVectorSize == 64) {
1273     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1274     __ prefetcht0(Address(src, index, scale, offset + 0x200));
1275     __ prefetcht0(Address(src, index, scale, offset + 0x240));
1276     __ prefetcht0(Address(src, index, scale, offset + 0x280));
1277     __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1278 
1279     __ prefetcht0(Address(src, index, scale, offset + 0x400));
1280     __ prefetcht0(Address(src, index, scale, offset + 0x440));
1281     __ prefetcht0(Address(src, index, scale, offset + 0x480));
1282     __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1283 
1284     __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1285     __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1286     __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1287     __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1288 
1289     __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1290     __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1291     __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1292     __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1293   }
1294 }
1295 
1296 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1297                                        KRegister mask, Register length, Register index,
1298                                        Register temp, int shift, int offset,
1299                                        bool use64byteVector) {
1300   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1301   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1302   if (!use64byteVector) {
1303     copy32_avx(dst, src, index, xmm, shift, offset);
1304     __ subptr(length, 32 >> shift);
1305     copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1306   } else {
1307     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1308     assert(MaxVectorSize == 64, "vector length != 64");
1309     __ mov64(temp, -1L);
1310     __ bzhiq(temp, temp, length);
1311     __ kmovql(mask, temp);
1312     __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1313     __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1314   }
1315 }
1316 
1317 
1318 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1319                                        KRegister mask, Register length, Register index,
1320                                        Register temp, int shift, int offset) {
1321   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1322   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1323   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1324   __ mov64(temp, -1L);
1325   __ bzhiq(temp, temp, length);
1326   __ kmovql(mask, temp);
1327   __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1328   __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1329 }
1330 
1331 
1332 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1333                                 int shift, int offset) {
1334   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1335   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1336   __ vmovdqu(xmm, Address(src, index, scale, offset));
1337   __ vmovdqu(Address(dst, index, scale, offset), xmm);
1338 }
1339 
1340 
1341 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1342                                 bool conjoint, int shift, int offset, bool use64byteVector) {
1343   assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1344   if (!use64byteVector) {
1345     if (conjoint) {
1346       copy32_avx(dst, src, index, xmm, shift, offset+32);
1347       copy32_avx(dst, src, index, xmm, shift, offset);
1348     } else {
1349       copy32_avx(dst, src, index, xmm, shift, offset);
1350       copy32_avx(dst, src, index, xmm, shift, offset+32);
1351     }
1352   } else {
1353     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1354     __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1355     __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1356   }
1357 }
1358 
1359 #endif // COMPILER2_OR_JVMCI
1360 
1361 
1362 // Arguments:
1363 //   entry     - location for return of (post-push) entry
1364 //
1365 // Inputs:
1366 //   c_rarg0   - source array address
1367 //   c_rarg1   - destination array address
1368 //   c_rarg2   - element count, treated as ssize_t, can be zero
1369 //
1370 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1371 // we let the hardware handle it.  The one to eight bytes within words,
1372 // dwords or qwords that span cache line boundaries will still be loaded
1373 // and stored atomically.
1374 //
1375 // Side Effects:
1376 //   entry is set to the no-overlap entry point
1377 //   used by generate_conjoint_byte_copy().
1378 //
address StubGenerator::generate_disjoint_byte_copy(address* entry) {
  StubId stub_id = StubId::stubgen_jbyte_disjoint_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#if COMPILER2_OR_JVMCI
  // Prefer the AVX-512 masked variant when AVX512VL/BW and BMI2 are
  // available and at least 32-byte vectors are permitted.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
    return generate_disjoint_copy_avx3_masked(stub_id, entry);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  Label L_copy_byte, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    // Split the byte count into a qword count plus up to 7 trailing bytes
    // (a dword, a word and a byte, tested individually below).
    __ movptr(byte_count, count);
    __ shrptr(count, 3); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count); // make the count negative
    // Jump to the bulk copy loop, which is emitted out of line after the
    // exit sequence (see copy_bytes_forward below).
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
  __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jccb(Assembler::zero, L_copy_byte);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);

    __ addptr(end_from, 2);
    __ addptr(end_to, 2);

    // Check for and copy trailing byte
  __ BIND(L_copy_byte);
    __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movb(rax, Address(end_from, 8));
    __ movb(Address(end_to, 8), rax);
  }
__ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // Out-of-line bulk copy loop; faults here resume at ucme_exit_pc.
    UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
    __ jmp(L_copy_4_bytes);
  }
  return start;
}
1481 
1482 
1483 // Arguments:
1484 //   entry     - location for return of (post-push) entry
1485 //   nooverlap_target - entry to branch to if no overlap detected
1486 //
1487 // Inputs:
1488 //   c_rarg0   - source array address
1489 //   c_rarg1   - destination array address
1490 //   c_rarg2   - element count, treated as ssize_t, can be zero
1491 //
1492 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1493 // we let the hardware handle it.  The one to eight bytes within words,
1494 // dwords or qwords that span cache line boundaries will still be loaded
1495 // and stored atomically.
1496 //
address StubGenerator::generate_conjoint_byte_copy(address nooverlap_target, address* entry) {
  StubId stub_id = StubId::stubgen_jbyte_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#if COMPILER2_OR_JVMCI
  // Prefer the AVX-512 masked variant when AVX512VL/BW and BMI2 are
  // available and at least 32-byte vectors are permitted.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
    return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // If source and destination do not overlap, tail-call the disjoint stub.
  array_overlap_test(nooverlap_target, Address::times_1);
  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3);   // count => qword_count

    // Copy from high to low addresses.
    // The trailing byte/word/dword are copied first (highest addresses),
    // then the qword bulk loop runs downward.

    // Check for and copy trailing byte
    __ testl(byte_count, 1);
    __ jcc(Assembler::zero, L_copy_2_bytes);
    __ movb(rax, Address(from, byte_count, Address::times_1, -1));
    __ movb(Address(to, byte_count, Address::times_1, -1), rax);
    __ decrement(byte_count); // Adjust for possible trailing word

    // Check for and copy trailing word
  __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jcc(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, byte_count, Address::times_1, -2));
    __ movw(Address(to, byte_count, Address::times_1, -2), rax);

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  // First exit sequence: reached when the qword loop above finishes.
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // Copy in multi-bytes chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
  }
  // Second exit sequence: fall-through from the chunked backward copy.
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
1591 
1592 
1593 // Arguments:
1594 //   entry     - location for return of (post-push) entry
1595 //
1596 // Inputs:
1597 //   c_rarg0   - source array address
1598 //   c_rarg1   - destination array address
1599 //   c_rarg2   - element count, treated as ssize_t, can be zero
1600 //
1601 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1602 // let the hardware handle it.  The two or four words within dwords
1603 // or qwords that span cache line boundaries will still be loaded
1604 // and stored atomically.
1605 //
1606 // Side Effects:
1607 //   entry is set to the no-overlap entry point
1608 //   used by generate_conjoint_short_copy().
1609 //
address StubGenerator::generate_disjoint_short_copy(address *entry) {
  StubId stub_id = StubId::stubgen_jshort_disjoint_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#if COMPILER2_OR_JVMCI
  // Prefer the AVX-512 masked variant when AVX512VL/BW and BMI2 are
  // available and at least 32-byte vectors are permitted.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
    return generate_disjoint_copy_avx3_masked(stub_id, entry);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register word_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    // Split the word count into a qword count plus up to 3 trailing words
    // (a dword and a word, tested individually below).
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    // Jump to the bulk copy loop, which is emitted out of line after the
    // exit sequence (see copy_bytes_forward below).
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Original 'dest' is trashed, so we can't use it as a
    // base register for a possible trailing word copy

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
  __ BIND(L_copy_2_bytes);
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);
  }
__ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // Out-of-line bulk copy loop; faults here resume at ucme_exit_pc.
    UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
    __ jmp(L_copy_4_bytes);
  }

  return start;
}
1706 
1707 
1708 address StubGenerator::generate_fill(StubId stub_id) {
1709   BasicType t;
1710   bool aligned;
1711 
1712   switch (stub_id) {
1713   case StubId::stubgen_jbyte_fill_id:
1714     t = T_BYTE;
1715     aligned = false;
1716     break;
1717   case StubId::stubgen_jshort_fill_id:
1718     t = T_SHORT;
1719     aligned = false;
1720     break;
1721   case StubId::stubgen_jint_fill_id:
1722     t = T_INT;
1723     aligned = false;
1724     break;
1725   case StubId::stubgen_arrayof_jbyte_fill_id:
1726     t = T_BYTE;
1727     aligned = true;
1728     break;
1729   case StubId::stubgen_arrayof_jshort_fill_id:
1730     t = T_SHORT;
1731     aligned = true;
1732     break;
1733   case StubId::stubgen_arrayof_jint_fill_id:
1734     t = T_INT;
1735     aligned = true;
1736     break;
1737   default:
1738     ShouldNotReachHere();
1739   }
1740 
1741   __ align(CodeEntryAlignment);
1742   StubCodeMark mark(this, stub_id);
1743   address start = __ pc();
1744 
1745   BLOCK_COMMENT("Entry:");
1746 
1747   const Register to       = c_rarg0;  // destination array address
1748   const Register value    = c_rarg1;  // value
1749   const Register count    = c_rarg2;  // elements count
1750   __ mov(r11, count);
1751 
1752   __ enter(); // required for proper stackwalking of RuntimeStub frame
1753 
1754   {
1755     // Add set memory mark to protect against unsafe accesses faulting
1756     UnsafeMemoryAccessMark umam(this, ((t == T_BYTE) && !aligned), true);
1757     __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1758   }
1759 
1760   __ vzeroupper();
1761   __ leave(); // required for proper stackwalking of RuntimeStub frame
1762   __ ret(0);
1763 
1764   return start;
1765 }
1766 
1767 
1768 // Arguments:
1769 //   entry     - location for return of (post-push) entry
1770 //   nooverlap_target - entry to branch to if no overlap detected
1771 //
1772 // Inputs:
1773 //   c_rarg0   - source array address
1774 //   c_rarg1   - destination array address
1775 //   c_rarg2   - element count, treated as ssize_t, can be zero
1776 //
1777 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1778 // let the hardware handle it.  The two or four words within dwords
1779 // or qwords that span cache line boundaries will still be loaded
1780 // and stored atomically.
1781 //
address StubGenerator::generate_conjoint_short_copy(address nooverlap_target, address *entry) {
  StubId stub_id = StubId::stubgen_jshort_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#if COMPILER2_OR_JVMCI
  // Prefer the AVX-512 masked variant when AVX512VL/BW and BMI2 are
  // available and at least 32-byte vectors are permitted.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
    return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register word_count  = rcx;
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // If source and destination do not overlap, tail-call the disjoint stub.
  array_overlap_test(nooverlap_target, Address::times_2);
  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count

    // Copy from high to low addresses.  Use 'to' as scratch.
    // The trailing word/dword are copied first (highest addresses),
    // then the qword bulk loop runs downward.

    // Check for and copy trailing word
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, word_count, Address::times_2, -2));
    __ movw(Address(to, word_count, Address::times_2, -2), rax);

   // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  // First exit sequence: reached when the qword loop above finishes.
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // Copy in multi-bytes chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
  }
  // Second exit sequence: fall-through from the chunked backward copy.
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
1869 
1870 
1871 // Arguments:
//   stub_id   - unique id for stub to generate
1873 //   entry     - location for return of (post-push) entry
1874 //   is_oop    - true => oop array, so generate store check code
1875 //
1876 // Inputs:
1877 //   c_rarg0   - source array address
1878 //   c_rarg1   - destination array address
1879 //   c_rarg2   - element count, treated as ssize_t, can be zero
1880 //
1881 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1882 // the hardware handle it.  The two dwords within qwords that span
1883 // cache line boundaries will still be loaded and stored atomically.
1884 //
1885 // Side Effects:
1886 //   disjoint_int_copy_entry is set to the no-overlap entry point
1887 //   used by generate_conjoint_int_oop_copy().
1888 //
address StubGenerator::generate_disjoint_int_oop_copy(StubId stub_id, address* entry) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  // Decode is_oop / dest_uninitialized from the stub id; the oop variants
  // are only used with compressed oops (4-byte elements).
  bool is_oop;
  bool dest_uninitialized;
  switch (stub_id) {
  case StubId::stubgen_jint_disjoint_arraycopy_id:
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_id:
    assert(UseCompressedOops, "inconsistent oop copy size!");
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
    assert(UseCompressedOops, "inconsistent oop copy size!");
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
  // Prefer the AVX-512 masked variant when supported; oop copies
  // additionally require barrier-set support for it.
  if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
    return generate_disjoint_copy_avx3_masked(stub_id, entry);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register dword_count = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                 // r9 is used to save r15_thread

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BasicType type = is_oop ? T_OBJECT : T_INT;
  // GC pre-barrier for the destination region (no-op for plain ints).
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid
    // Split the dword count into a qword count plus at most one trailing dword.
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    // Jump to the bulk copy loop, which is emitted out of line after the
    // exit sequence (see copy_bytes_forward below).
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
    __ jccb(Assembler::zero, L_exit);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);
  }
__ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  // GC post-barrier covering all copied elements (no-op for plain ints).
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ vzeroupper();
  __ xorptr(rax, rax); // return 0
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // Out-of-line bulk copy loop; faults here resume at ucme_exit_pc.
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
    __ jmp(L_copy_4_bytes);
  }

  return start;
}
2004 
2005 
2006 // Arguments:
2007 //   entry     - location for return of (post-push) entry
2008 //   nooverlap_target - entry to branch to if no overlap detected
2009 //   is_oop  - true => oop array, so generate store check code
2010 //
2011 // Inputs:
2012 //   c_rarg0   - source array address
2013 //   c_rarg1   - destination array address
2014 //   c_rarg2   - element count, treated as ssize_t, can be zero
2015 //
2016 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2017 // the hardware handle it.  The two dwords within qwords that span
2018 // cache line boundaries will still be loaded and stored atomically.
2019 //
address StubGenerator::generate_conjoint_int_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  // Decode is_oop / dest_uninitialized from the stub id; the oop variants
  // are only used with compressed oops (4-byte elements).
  bool is_oop;
  bool dest_uninitialized;
  switch (stub_id) {
  case StubId::stubgen_jint_arraycopy_id:
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_id:
    assert(UseCompressedOops, "inconsistent oop copy size!");
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_uninit_id:
    assert(UseCompressedOops, "inconsistent oop copy size!");
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
  // Prefer the AVX-512 masked variant when supported; oop copies
  // additionally require barrier-set support for it.
  if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
    return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register dword_count = rcx;
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // If source and destination do not overlap, tail-call the disjoint stub.
  array_overlap_test(nooverlap_target, Address::times_4);
  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                 // r9 is used to save r15_thread

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BasicType type = is_oop ? T_OBJECT : T_INT;
  // no registers are destroyed by this call
  // GC pre-barrier for the destination region (no-op for plain ints).
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  assert_clean_int(count, rax); // Make sure 'count' is clean int.
  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count

    // Copy from high to low addresses.  Use 'to' as scratch.
    // Any odd trailing dword is copied first (highest address),
    // then the qword bulk loop runs downward.

    // Check for and copy trailing dword
    __ testl(dword_count, 1);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, dword_count, Address::times_4, -4));
    __ movl(Address(to, dword_count, Address::times_4, -4), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  if (is_oop) {
    // oop copies must run the epilogue barrier; join the shared exit.
    __ jmp(L_exit);
  }
  // Non-oop first exit sequence: reached when the qword loop finishes.
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // Copy in multi-bytes chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
  }

__ BIND(L_exit);
  // GC post-barrier covering all copied elements (no-op for plain ints).
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
2139 
2140 
2141 // Arguments:
2142 //   entry     - location for return of (post-push) entry
2143 //
2144 // Inputs:
2145 //   c_rarg0   - source array address
2146 //   c_rarg1   - destination array address
2147 //   c_rarg2   - element count, treated as ssize_t, can be zero
2148 //
// Side Effects:
2150 //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2151 //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2152 //
2153 address StubGenerator::generate_disjoint_long_oop_copy(StubId stub_id, address *entry) {
2154   // aligned is always false -- x86_64 always uses the unaligned code
2155   const bool aligned = false;
2156   bool is_oop;
2157   bool dest_uninitialized;
2158   switch (stub_id) {
2159   case StubId::stubgen_jlong_disjoint_arraycopy_id:
2160     is_oop = false;
2161     dest_uninitialized = false;
2162     break;
2163   case StubId::stubgen_oop_disjoint_arraycopy_id:
2164     assert(!UseCompressedOops, "inconsistent oop copy size!");
2165     is_oop = true;
2166     dest_uninitialized = false;
2167     break;
2168   case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2169     assert(!UseCompressedOops, "inconsistent oop copy size!");
2170     is_oop = true;
2171     dest_uninitialized = true;
2172     break;
2173   default:
2174     ShouldNotReachHere();
2175   }
2176 
2177   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2178 #if COMPILER2_OR_JVMCI
2179   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2180     return generate_disjoint_copy_avx3_masked(stub_id, entry);
2181   }
2182 #endif
2183 
2184   __ align(CodeEntryAlignment);
2185   StubCodeMark mark(this, stub_id);
2186   address start = __ pc();
2187 
2188   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2189   const Register from        = rdi;  // source array address
2190   const Register to          = rsi;  // destination array address
2191   const Register qword_count = rdx;  // elements count
2192   const Register end_from    = from; // source array end address
2193   const Register end_to      = rcx;  // destination array end address
2194   const Register saved_count = r11;
2195   // End pointers are inclusive, and if count is not zero they point
2196   // to the last unit copied:  end_to[0] := end_from[0]
2197 
2198   __ enter(); // required for proper stackwalking of RuntimeStub frame
2199   // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2200   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2201 
2202   if (entry != nullptr) {
2203     *entry = __ pc();
2204     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2205     BLOCK_COMMENT("Entry:");
2206   }
2207 
2208   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2209                                    // r9 is used to save r15_thread
2210   // 'from', 'to' and 'qword_count' are now valid
2211 
2212   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2213   if (dest_uninitialized) {
2214     decorators |= IS_DEST_UNINITIALIZED;
2215   }
2216   if (aligned) {
2217     decorators |= ARRAYCOPY_ALIGNED;
2218   }
2219 
2220   BasicType type = is_oop ? T_OBJECT : T_LONG;
2221   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2222   {
2223     // UnsafeMemoryAccess page error: continue after unsafe access
2224     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2225 
2226     // Copy from low to high addresses.  Use 'to' as scratch.
2227     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2228     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2229     __ negptr(qword_count);
2230     __ jmp(L_copy_bytes);
2231 
2232     // Copy trailing qwords
2233   __ BIND(L_copy_8_bytes);
2234     bs->copy_load_at(_masm, decorators, type, 8,
2235                      rax, Address(end_from, qword_count, Address::times_8, 8),
2236                      r10);
2237     bs->copy_store_at(_masm, decorators, type, 8,
2238                       Address(end_to, qword_count, Address::times_8, 8), rax,
2239                       r10);
2240     __ increment(qword_count);
2241     __ jcc(Assembler::notZero, L_copy_8_bytes);
2242   }
2243   if (is_oop) {
2244     __ jmp(L_exit);
2245   } else {
2246     restore_arg_regs_using_thread();
2247     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2248     __ xorptr(rax, rax); // return 0
2249     __ vzeroupper();
2250     __ leave(); // required for proper stackwalking of RuntimeStub frame
2251     __ ret(0);
2252   }
2253 
2254   {
2255     // UnsafeMemoryAccess page error: continue after unsafe access
2256     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2257     // Copy in multi-bytes chunks
2258     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2259   }
2260 
2261   __ BIND(L_exit);
2262   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2263   restore_arg_regs_using_thread();
2264   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2265                           SharedRuntime::_jlong_array_copy_ctr,
2266                  rscratch1); // Update counter after rscratch1 is free
2267   __ vzeroupper();
2268   __ xorptr(rax, rax); // return 0
2269   __ leave(); // required for proper stackwalking of RuntimeStub frame
2270   __ ret(0);
2271 
2272   return start;
2273 }
2274 
2275 
2276 // Arguments:
2277 //   entry     - location for return of (post-push) entry
2278 //   nooverlap_target - entry to branch to if no overlap detected
2279 //   is_oop  - true => oop array, so generate store check code
2280 //
2281 // Inputs:
2282 //   c_rarg0   - source array address
2283 //   c_rarg1   - destination array address
2284 //   c_rarg2   - element count, treated as ssize_t, can be zero
2285 //
2286 address StubGenerator::generate_conjoint_long_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2287   // aligned is always false -- x86_64 always uses the unaligned code
2288   const bool aligned = false;
2289   bool is_oop;
2290   bool dest_uninitialized;
2291   switch (stub_id) {
2292   case StubId::stubgen_jlong_arraycopy_id:
2293     is_oop = false;
2294     dest_uninitialized = false;
2295     break;
2296   case StubId::stubgen_oop_arraycopy_id:
2297     assert(!UseCompressedOops, "inconsistent oop copy size!");
2298     is_oop = true;
2299     dest_uninitialized = false;
2300     break;
2301   case StubId::stubgen_oop_arraycopy_uninit_id:
2302     assert(!UseCompressedOops, "inconsistent oop copy size!");
2303     is_oop = true;
2304     dest_uninitialized = true;
2305     break;
2306   default:
2307     ShouldNotReachHere();
2308   }
2309 
2310   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2311 #if COMPILER2_OR_JVMCI
2312   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2313     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2314   }
2315 #endif
2316 
2317   __ align(CodeEntryAlignment);
2318   StubCodeMark mark(this, stub_id);
2319   address start = __ pc();
2320 
2321   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2322   const Register from        = rdi;  // source array address
2323   const Register to          = rsi;  // destination array address
2324   const Register qword_count = rdx;  // elements count
2325   const Register saved_count = rcx;
2326 
2327   __ enter(); // required for proper stackwalking of RuntimeStub frame
2328   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2329 
2330   if (entry != nullptr) {
2331     *entry = __ pc();
2332     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2333     BLOCK_COMMENT("Entry:");
2334   }
2335 
2336   array_overlap_test(nooverlap_target, Address::times_8);
2337   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2338                                  // r9 is used to save r15_thread
2339   // 'from', 'to' and 'qword_count' are now valid
2340 
2341   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2342   if (dest_uninitialized) {
2343     decorators |= IS_DEST_UNINITIALIZED;
2344   }
2345   if (aligned) {
2346     decorators |= ARRAYCOPY_ALIGNED;
2347   }
2348 
2349   BasicType type = is_oop ? T_OBJECT : T_LONG;
2350   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2351   {
2352     // UnsafeMemoryAccess page error: continue after unsafe access
2353     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2354 
2355     __ jmp(L_copy_bytes);
2356 
2357     // Copy trailing qwords
2358   __ BIND(L_copy_8_bytes);
2359     bs->copy_load_at(_masm, decorators, type, 8,
2360                      rax, Address(from, qword_count, Address::times_8, -8),
2361                      r10);
2362     bs->copy_store_at(_masm, decorators, type, 8,
2363                       Address(to, qword_count, Address::times_8, -8), rax,
2364                       r10);
2365     __ decrement(qword_count);
2366     __ jcc(Assembler::notZero, L_copy_8_bytes);
2367   }
2368   if (is_oop) {
2369     __ jmp(L_exit);
2370   } else {
2371     restore_arg_regs_using_thread();
2372     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2373     __ xorptr(rax, rax); // return 0
2374     __ vzeroupper();
2375     __ leave(); // required for proper stackwalking of RuntimeStub frame
2376     __ ret(0);
2377   }
2378   {
2379     // UnsafeMemoryAccess page error: continue after unsafe access
2380     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2381 
2382     // Copy in multi-bytes chunks
2383     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2384   }
2385   __ BIND(L_exit);
2386   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2387   restore_arg_regs_using_thread();
2388   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2389                           SharedRuntime::_jlong_array_copy_ctr,
2390                  rscratch1); // Update counter after rscratch1 is free
2391   __ vzeroupper();
2392   __ xorptr(rax, rax); // return 0
2393   __ leave(); // required for proper stackwalking of RuntimeStub frame
2394   __ ret(0);
2395 
2396   return start;
2397 }
2398 
2399 
2400 // Helper for generating a dynamic type check.
2401 // Smashes no registers.
void StubGenerator::generate_type_check(Register sub_klass,
                                        Register super_check_offset,
                                        Register super_klass,
                                        Label& L_success) {
  assert_different_registers(sub_klass, super_check_offset, super_klass);

  BLOCK_COMMENT("type_check:");

  Label L_miss;

  // Fast path: branches to L_success on a hit and to L_miss on a
  // definite miss (see check_klass_subtype_fast_path); when it cannot
  // decide, control falls through to the slow path below.
  __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
                                   super_check_offset);
  // Slow path: branches to L_success on a hit, otherwise falls through.
  __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);

  // Fall through on failure!
  __ BIND(L_miss);
}
2419 
2420 //
2421 //  Generate checkcasting array copy stub
2422 //
2423 //  Input:
2424 //    c_rarg0   - source array address
2425 //    c_rarg1   - destination array address
2426 //    c_rarg2   - element count, treated as ssize_t, can be zero
2427 //    c_rarg3   - size_t ckoff (super_check_offset)
2428 // not Win64
2429 //    c_rarg4   - oop ckval (super_klass)
2430 // Win64
2431 //    rsp+40    - oop ckval (super_klass)
2432 //
2433 //  Output:
2434 //    rax ==  0  -  success
2435 //    rax == -1^K - failure, where K is partial transfer count
2436 //
address StubGenerator::generate_checkcast_copy(StubId stub_id, address *entry) {

  bool dest_uninitialized;
  switch (stub_id) {
  case StubId::stubgen_checkcast_arraycopy_id:
    dest_uninitialized = false;
    break;
  case StubId::stubgen_checkcast_arraycopy_uninit_id:
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  Label L_load_element, L_store_element, L_do_card_marks, L_done;

  // Input registers (after setup_arg_regs)
  const Register from        = rdi;   // source array address
  const Register to          = rsi;   // destination array address
  const Register length      = rdx;   // elements count
  const Register ckoff       = rcx;   // super_check_offset
  const Register ckval       = r8;    // super_klass

  // Registers used as temps (r13, r14 are save-on-entry)
  const Register end_from    = from;  // source array end address
  const Register end_to      = r13;   // destination array end address
  const Register count       = rdx;   // -(count_remaining)
  const Register r14_length  = r14;   // saved copy of length
  // End pointers are inclusive, and if length is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  const Register rax_oop    = rax;    // actual oop copied
  const Register r11_klass  = r11;    // oop._klass

  //---------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the two arrays are subtypes of Object[] but the
  // destination array type is not equal to or a supertype
  // of the source type.  Each element must be separately
  // checked.

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
  // caller guarantees that the arrays really are different
  // otherwise, we would have to make conjoint checks
  { Label L;
    array_overlap_test(L, TIMES_OOP);
    __ stop("checkcast_copy within a single array");
    __ bind(L);
  }
#endif //ASSERT

  setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
                                  // ckoff => rcx, ckval => r8
                                  // r9 is used to save r15_thread
#ifdef _WIN64
  // last argument (#4) is on stack on Win64
  __ movptr(ckval, Address(rsp, 6 * wordSize));
#endif

  // Caller of this entry point must set up the argument registers.
  if (entry != nullptr) {
    *entry = __ pc();
    BLOCK_COMMENT("Entry:");
  }

  // allocate spill slots for r13, r14
  enum {
    saved_r13_offset,
    saved_r14_offset,
    saved_r10_offset,
    saved_rbp_offset  // == 3; used only to size the spill area (rbp itself was saved by enter())
  };
  __ subptr(rsp, saved_rbp_offset * wordSize);  // reserve the three slots above
  __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
  __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
  __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);

#ifdef ASSERT
    // Sanity check: r15 must still hold the current thread.
    Label L2;
    __ get_thread_slow(r14);
    __ cmpptr(r15_thread, r14);
    __ jcc(Assembler::equal, L2);
    __ stop("StubRoutines::call_stub: r15_thread is modified by call");
    __ bind(L2);
#endif // ASSERT

  // check that int operands are properly extended to size_t
  assert_clean_int(length, rax);
  assert_clean_int(ckoff, rax);

#ifdef ASSERT
  BLOCK_COMMENT("assert consistent ckoff/ckval");
  // The ckoff and ckval must be mutually consistent,
  // even though caller generates both.
  { Label L;
    int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ cmpl(ckoff, Address(ckval, sco_offset));
    __ jcc(Assembler::equal, L);
    __ stop("super_check_offset inconsistent");
    __ bind(L);
  }
#endif //ASSERT

  // Loop-invariant addresses.  They are exclusive end pointers.
  Address end_from_addr(from, length, TIMES_OOP, 0);
  Address   end_to_addr(to,   length, TIMES_OOP, 0);
  // Loop-variant addresses.  They assume post-incremented count < 0.
  Address from_element_addr(end_from, count, TIMES_OOP, 0);
  Address   to_element_addr(end_to,   count, TIMES_OOP, 0);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }

  BasicType type = T_OBJECT;
  size_t element_size = UseCompressedOops ? 4 : 8;

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  // Copy from low to high addresses, indexed from the end of each array.
  __ lea(end_from, end_from_addr);
  __ lea(end_to,   end_to_addr);
  __ movptr(r14_length, length);        // save a copy of the length
  assert(length == count, "");          // else fix next line:
  __ negptr(count);                     // negate and test the length
  __ jcc(Assembler::notZero, L_load_element);

  // Empty array:  Nothing to do.
  __ xorptr(rax, rax);                  // return 0 on (trivial) success
  __ jmp(L_done);

  // ======== begin loop ========
  // (Loop is rotated; its entry is L_load_element.)
  // Loop control:
  //   for (count = -count; count != 0; count++)
  // Base pointers src, dst are biased by 8*(count-1),to last element.
  __ align(OptoLoopAlignment);

  __ BIND(L_store_element);
  bs->copy_store_at(_masm,
                    decorators,
                    type,
                    element_size,
                    to_element_addr,
                    rax_oop,
                    r10);
  __ increment(count);               // increment the count toward zero
  __ jcc(Assembler::zero, L_do_card_marks);

  // ======== loop entry is here ========
  __ BIND(L_load_element);
  bs->copy_load_at(_masm,
                   decorators,
                   type,
                   element_size,
                   rax_oop,
                   from_element_addr,
                   r10);
  __ testptr(rax_oop, rax_oop);
  __ jcc(Assembler::zero, L_store_element);  // null elements need no type check

  __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
  generate_type_check(r11_klass, ckoff, ckval, L_store_element);
  // ======== end loop ========
  // generate_type_check falls through to here only when the element
  // failed the subtype check.

  // It was a real error; we must depend on the caller to finish the job.
  // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
  // Emit GC store barriers for the oops we have copied (r14 + rdx),
  // and report their number to the caller.
  assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
  Label L_post_barrier;
  __ addptr(r14_length, count);     // K = (original - remaining) oops
  __ movptr(rax, r14_length);       // save the value
  __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
  __ jccb(Assembler::notZero, L_post_barrier);
  __ jmp(L_done); // K == 0, nothing was copied, skip post barrier

  // Come here on success only.
  __ BIND(L_do_card_marks);
  __ xorptr(rax, rax);              // return 0 on success

  __ BIND(L_post_barrier);
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);

  // Common exit point (success or failure).
  __ BIND(L_done);
  __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
  __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
  __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
2641 
2642 
2643 //  Generate 'unsafe' array copy stub
2644 //  Though just as safe as the other stubs, it takes an unscaled
2645 //  size_t argument instead of an element count.
2646 //
2647 //  Input:
2648 //    c_rarg0   - source array address
2649 //    c_rarg1   - destination array address
2650 //    c_rarg2   - byte count, treated as ssize_t, can be zero
2651 //
2652 // Examines the alignment of the operands and dispatches
2653 // to a long, int, short, or byte copy loop.
2654 //
address StubGenerator::generate_unsafe_copy(address byte_copy_entry, address short_copy_entry,
                                            address int_copy_entry, address long_copy_entry) {

  Label L_long_aligned, L_int_aligned, L_short_aligned;

  // Input registers (before setup_arg_regs)
  const Register from        = c_rarg0;  // source array address
  const Register to          = c_rarg1;  // destination array address
  const Register size        = c_rarg2;  // byte count (size_t)

  // Register used as a temp
  const Register bits        = rax;      // test copy of low bits

  __ align(CodeEntryAlignment);
  StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);

  // OR together the low bits of both addresses and the byte count; the
  // combined value determines the widest element size for which all three
  // operands are simultaneously aligned.
  __ mov(bits, from);
  __ orptr(bits, to);
  __ orptr(bits, size);

  __ testb(bits, BytesPerLong-1);
  __ jccb(Assembler::zero, L_long_aligned);

  __ testb(bits, BytesPerInt-1);
  __ jccb(Assembler::zero, L_int_aligned);

  // Not even short-aligned: tail-call the byte copy stub.
  __ testb(bits, BytesPerShort-1);
  __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));

  // For each aligned case, rescale the byte count to an element count and
  // tail-call the matching element copy stub (arguments are already in the
  // right registers).
  __ BIND(L_short_aligned);
  __ shrptr(size, LogBytesPerShort); // size => short_count
  __ jump(RuntimeAddress(short_copy_entry));

  __ BIND(L_int_aligned);
  __ shrptr(size, LogBytesPerInt); // size => int_count
  __ jump(RuntimeAddress(int_copy_entry));

  __ BIND(L_long_aligned);
  __ shrptr(size, LogBytesPerLong); // size => qword_count
  __ jump(RuntimeAddress(long_copy_entry));

  return start;
}
2705 
2706 
// Element-width selector for do_setmemory_atomic_loop: 2-, 4-, or 8-byte
// fill granularity.
enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
2709 // Helper for generate_unsafe_setmemory
2710 //
2711 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
2712 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
2713                                      Register size, Register wide_value,
2714                                      Register tmp, Label& L_exit,
2715                                      MacroAssembler *_masm) {
2716   Label L_Loop, L_Tail, L_TailLoop;
2717 
2718   int shiftval = 0;
2719   int incr = 0;
2720 
2721   switch (type) {
2722     case USM_SHORT:
2723       shiftval = 1;
2724       incr = 16;
2725       break;
2726     case USM_DWORD:
2727       shiftval = 2;
2728       incr = 32;
2729       break;
2730     case USM_QUADWORD:
2731       shiftval = 3;
2732       incr = 64;
2733       break;
2734   }
2735 
2736   // At this point, we know the lower bits of size are zero
2737   __ shrq(size, shiftval);
2738   // size now has number of X-byte chunks (2, 4 or 8)
2739 
2740   // Number of (8*X)-byte chunks into tmp
2741   __ movq(tmp, size);
2742   __ shrq(tmp, 3);
2743   __ jccb(Assembler::zero, L_Tail);
2744 
2745   __ BIND(L_Loop);
2746 
2747   // Unroll 8 stores
2748   for (int i = 0; i < 8; i++) {
2749     switch (type) {
2750       case USM_SHORT:
2751         __ movw(Address(dest, (2 * i)), wide_value);
2752         break;
2753       case USM_DWORD:
2754         __ movl(Address(dest, (4 * i)), wide_value);
2755         break;
2756       case USM_QUADWORD:
2757         __ movq(Address(dest, (8 * i)), wide_value);
2758         break;
2759     }
2760   }
2761   __ addq(dest, incr);
2762   __ decrementq(tmp);
2763   __ jccb(Assembler::notZero, L_Loop);
2764 
2765   __ BIND(L_Tail);
2766 
2767   // Find number of remaining X-byte chunks
2768   __ andq(size, 0x7);
2769 
2770   // If zero, then we're done
2771   __ jccb(Assembler::zero, L_exit);
2772 
2773   __ BIND(L_TailLoop);
2774 
2775     switch (type) {
2776       case USM_SHORT:
2777         __ movw(Address(dest, 0), wide_value);
2778         break;
2779       case USM_DWORD:
2780         __ movl(Address(dest, 0), wide_value);
2781         break;
2782       case USM_QUADWORD:
2783         __ movq(Address(dest, 0), wide_value);
2784         break;
2785     }
2786   __ addq(dest, incr >> 3);
2787   __ decrementq(size);
2788   __ jccb(Assembler::notZero, L_TailLoop);
2789 }
2790 
2791 //  Generate 'unsafe' set memory stub
2792 //  Though just as safe as the other stubs, it takes an unscaled
2793 //  size_t (# bytes) argument instead of an element count.
2794 //
2795 //  Input:
2796 //    c_rarg0   - destination array address
2797 //    c_rarg1   - byte count (size_t)
2798 //    c_rarg2   - byte value
2799 //
2800 // Examines the alignment of the operands and dispatches
2801 // to an int, short, or byte fill loop.
2802 //
address StubGenerator::generate_unsafe_setmemory(address unsafe_byte_fill) {
  __ align(CodeEntryAlignment);
  StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  __ enter();   // required for proper stackwalking of RuntimeStub frame

  assert(unsafe_byte_fill != nullptr, "Invalid call");

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);

  {
    Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;

    const Register dest = c_rarg0;
    const Register size = c_rarg1;
    const Register byteVal = c_rarg2;
    const Register wide_value = rax;
    const Register rScratch1 = r10;

    assert_different_registers(dest, size, byteVal, wide_value, rScratch1);

    //     fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)

    // Zero-length fill: nothing to do.
    __ testq(size, size);
    __ jcc(Assembler::zero, L_exit);

    // Propagate byte to full Register
    // (multiplying the zero-extended byte by 0x0101010101010101 replicates
    // it into all eight byte lanes of wide_value)
    __ movzbl(rScratch1, byteVal);
    __ mov64(wide_value, 0x0101010101010101ULL);
    __ imulq(wide_value, rScratch1);

    // Check for pointer & size alignment
    // OR-ing dest and size lets one test cover both operands per granularity.
    __ movq(rScratch1, dest);
    __ orq(rScratch1, size);

    __ testb(rScratch1, 7);
    __ jcc(Assembler::equal, L_fillQuadwords);

    __ testb(rScratch1, 3);
    __ jcc(Assembler::equal, L_fillDwords);

    __ testb(rScratch1, 1);
    __ jcc(Assembler::notEqual, L_fillBytes);

    // Fill words
    {
      UnsafeMemoryAccessMark umam(this, true, true);

      // At this point, we know the lower bit of size is zero and a
      // multiple of 2
      do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
                               L_exit, _masm);
    }
    __ jmpb(L_exit);

    __ BIND(L_fillQuadwords);

    // Fill QUADWORDs
    {
      UnsafeMemoryAccessMark umam(this, true, true);

      // At this point, we know the lower 3 bits of size are zero and a
      // multiple of 8
      do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
                               L_exit, _masm);
    }
    // The quadword path falls straight through into the common exit below.
    __ BIND(L_exit);

    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_fillDwords);

    // Fill DWORDs
    {
      UnsafeMemoryAccessMark umam(this, true, true);

      // At this point, we know the lower 2 bits of size are zero and a
      // multiple of 4
      do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
                               L_exit, _masm);
    }
    __ jmpb(L_exit);

    __ BIND(L_fillBytes);
    // Set up for tail call to previously generated byte fill routine
    // Parameter order is (ptr, byteVal, size)
    __ xchgq(c_rarg1, c_rarg2);
    __ leave();    // Clear effect of enter()
    __ jump(RuntimeAddress(unsafe_byte_fill));
  }

  return start;
}
2899 
2900 // Perform range checks on the proposed arraycopy.
2901 // Kills temp, but nothing else.
2902 // Also, clean the sign bits of src_pos and dst_pos.
void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                                           Register src_pos, // source position (c_rarg1)
                                           Register dst,     // destination array oop (c_rarg2)
                                           Register dst_pos, // destination position (c_rarg3)
                                           Register length,
                                           Register temp,
                                           Label& L_failed) {
  BLOCK_COMMENT("arraycopy_range_checks:");

  //  if (src_pos + length > arrayOop(src)->length())  FAIL;
  __ movl(temp, length);
  __ addl(temp, src_pos);             // src_pos + length
  __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed); // 'above' is unsigned, so sums with bit 31 set fail too

  //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
  __ movl(temp, length);
  __ addl(temp, dst_pos);             // dst_pos + length
  __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed); // 'above' is unsigned, so sums with bit 31 set fail too

  // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  // Move with sign extension can be used since they are positive.
  __ movslq(src_pos, src_pos);
  __ movslq(dst_pos, dst_pos);

  BLOCK_COMMENT("arraycopy_range_checks done");
}
2931 
2932 
2933 //  Generate generic array copy stubs
2934 //
2935 //  Input:
2936 //    c_rarg0    -  src oop
2937 //    c_rarg1    -  src_pos (32-bits)
2938 //    c_rarg2    -  dst oop
2939 //    c_rarg3    -  dst_pos (32-bits)
2940 // not Win64
2941 //    c_rarg4    -  element count (32-bits)
2942 // Win64
2943 //    rsp+40     -  element count (32-bits)
2944 //
2945 //  Output:
2946 //    rax ==  0  -  success
2947 //    rax == -1^K - failure, where K is partial transfer count
2948 //
// Generates the generic (type-dispatching) arraycopy stub.  Validates the
// src/dst oops and index arguments, then tail-jumps to one of the supplied
// specialized copy stubs (byte/short/int/long/oop/checkcast) with the
// marshalled (from, to, count) arguments.  On any failed precondition it
// returns -1 (all validity checks per the numbered list below); on the
// dispatched paths the specialized stub produces the final return value.
address StubGenerator::generate_generic_copy(address byte_copy_entry, address short_copy_entry,
                                             address int_copy_entry, address oop_copy_entry,
                                             address long_copy_entry, address checkcast_copy_entry) {

  Label L_failed, L_failed_0, L_objArray;
  Label L_copy_shorts, L_copy_ints, L_copy_longs;

  // Input registers
  const Register src        = c_rarg0;  // source array oop
  const Register src_pos    = c_rarg1;  // source position
  const Register dst        = c_rarg2;  // destination array oop
  const Register dst_pos    = c_rarg3;  // destination position
#ifndef _WIN64
  const Register length     = c_rarg4;
  const Register rklass_tmp = r9;  // load_klass
#else
  // Win64 passes only four args in registers; the 5th (length) lives on the
  // caller's stack.  7 * wordSize = 32-byte shadow area + return address
  // + saved rbp (enter() below) + saved rdi (push_ppx below) = 56 bytes.
  const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
  const Register rklass_tmp = rdi;  // load_klass is callee-save on Win64, so rdi is saved/restored
#endif

  // Pad with nops so that the 5-byte jmp(L_failed) emitted just below ends
  // exactly on a CodeEntryAlignment boundary.  The stub entry that follows
  // is then aligned "for free" (verified by the assert after the jmp), and
  // L_failed_0 sits immediately before the entry for short backward hops.
  { int modulus = CodeEntryAlignment;
    int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
    int advance = target - (__ offset() % modulus);
    if (advance < 0)  advance += modulus;
    if (advance > 0)  __ nop(advance);
  }
  StubId stub_id = StubId::stubgen_generic_arraycopy_id;
  StubCodeMark mark(this, stub_id);

  // Short-hop target to L_failed.  Makes for denser prologue code.
  __ BIND(L_failed_0);
  __ jmp(L_failed);
  assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

  __ align(CodeEntryAlignment);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
  __ push_ppx(rklass_tmp); // rdi is callee-save on Windows
#endif

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);

  //-----------------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the following conditions are met:
  //
  // (1) src and dst must not be null.
  // (2) src_pos must not be negative.
  // (3) dst_pos must not be negative.
  // (4) length  must not be negative.
  // (5) src klass and dst klass should be the same and not null.
  // (6) src and dst should be arrays.
  // (7) src_pos + length must not exceed length of src.
  // (8) dst_pos + length must not exceed length of dst.
  //

  //  if (src == nullptr) return -1;
  __ testptr(src, src);         // src oop
  size_t j1off = __ offset();   // remember offset of 1st jump for the density check below
  __ jccb(Assembler::zero, L_failed_0);

  //  if (src_pos < 0) return -1;
  __ testl(src_pos, src_pos); // src_pos (32-bits)
  __ jccb(Assembler::negative, L_failed_0);

  //  if (dst == nullptr) return -1;
  __ testptr(dst, dst);         // dst oop
  __ jccb(Assembler::zero, L_failed_0);

  //  if (dst_pos < 0) return -1;
  __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  size_t j4off = __ offset();   // offset of 4th jump
  __ jccb(Assembler::negative, L_failed_0);

  // The first four tests are very dense code,
  // but not quite dense enough to put four
  // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predicters
  // do not like jumps so close together.
  // Make sure of this.
  guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");

  // registers used as temp
  const Register r11_length    = r11; // elements count to copy
  const Register r10_src_klass = r10; // array klass

  //  if (length < 0) return -1;
  __ movl(r11_length, length);        // length (elements count, 32-bits value)
  __ testl(r11_length, r11_length);
  __ jccb(Assembler::negative, L_failed_0);

  __ load_klass(r10_src_klass, src, rklass_tmp);
#ifdef ASSERT
  //  assert(src->klass() != nullptr);
  {
    BLOCK_COMMENT("assert klasses not null {");
    Label L1, L2;
    __ testptr(r10_src_klass, r10_src_klass);
    __ jcc(Assembler::notZero, L2);   // it is broken if klass is null
    __ bind(L1);
    __ stop("broken null klass");
    __ bind(L2);
    __ load_klass(rax, dst, rklass_tmp);
    __ cmpq(rax, 0);
    __ jcc(Assembler::equal, L1);     // this would be broken also
    BLOCK_COMMENT("} assert klasses not null done");
  }
#endif

  // Load layout helper (32-bits)
  //
  //  |array_tag|     | header_size | element_type |     |log2_element_size|
  // 32        30    24            16              8     2                 0
  //
  //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  //

  const int lh_offset = in_bytes(Klass::layout_helper_offset());

  // Handle objArrays completely differently...
  // The full 32-bit layout helper of an object array is a compile-time
  // constant, so a single cmpl against the klass field suffices.
  const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
  __ jcc(Assembler::equal, L_objArray);

  //  if (src->klass() != dst->klass()) return -1;
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax);
  __ jcc(Assembler::notEqual, L_failed);

  // Check for flat inline type array -> return -1
  __ test_flat_array_oop(src, rax, L_failed);

  // Check for null-free (non-flat) inline type array -> handle as object array
  // (elements are references, so the oop copy path applies)
  __ test_null_free_array_oop(src, rax, L_objArray);

  const Register rax_lh = rax;  // layout helper
  __ movl(rax_lh, Address(r10_src_klass, lh_offset));

  // Check for flat inline type array -> return -1
  __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace);
  __ jcc(Assembler::notZero, L_failed);

  //  if (!src->is_Array()) return -1;
  // Array layout helpers are negative (high tag bits set); a value at or
  // above _lh_neutral_value means "not an array".
  __ cmpl(rax_lh, Klass::_lh_neutral_value);
  __ jcc(Assembler::greaterEqual, L_failed);

  // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert primitive array {");
    Label L;
    __ movl(rklass_tmp, rax_lh);
    __ sarl(rklass_tmp, Klass::_lh_array_tag_shift);
    __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value);
    __ jcc(Assembler::equal, L);
    __ stop("must be a primitive array");
    __ bind(L);
    BLOCK_COMMENT("} assert primitive array done");
  }
#endif

  // Checks (7) and (8): positions + length within both array bounds.
  // Also re-sign-extends src_pos/dst_pos to 64 bits (see its definition).
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  // TypeArrayKlass
  //
  // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  //

  // NOTE: r10 and rax are reused below; r10_src_klass and rax_lh are dead
  // after these extractions.
  const Register r10_offset = r10;    // array offset
  const Register rax_elsize = rax_lh; // element size

  __ movl(r10_offset, rax_lh);
  __ shrl(r10_offset, Klass::_lh_header_size_shift);
  __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
  __ addptr(src, r10_offset);           // src array offset
  __ addptr(dst, r10_offset);           // dst array offset
  BLOCK_COMMENT("choose copy loop based on element size");
  __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize

#ifdef _WIN64
  __ pop_ppx(rklass_tmp); // Restore callee-save rdi
#endif

  // next registers should be set before the jump to corresponding stub
  const Register from     = c_rarg0;  // source array address
  const Register to       = c_rarg1;  // destination array address
  const Register count    = c_rarg2;  // elements count

  // 'from', 'to', 'count' registers should be set in such order
  // since they are the same as 'src', 'src_pos', 'dst'.
  // (Each lea consumes its inputs before the alias is overwritten.)

  // Dispatch on log2(element size): 0 = byte, 1 = short, 2 = int, 3 = long.
  __ cmpl(rax_elsize, 0);
  __ jccb(Assembler::notEqual, L_copy_shorts);
  __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(byte_copy_entry));

__ BIND(L_copy_shorts);
  __ cmpl(rax_elsize, LogBytesPerShort);
  __ jccb(Assembler::notEqual, L_copy_ints);
  __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(short_copy_entry));

__ BIND(L_copy_ints);
  __ cmpl(rax_elsize, LogBytesPerInt);
  __ jccb(Assembler::notEqual, L_copy_longs);
  __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(int_copy_entry));

__ BIND(L_copy_longs);
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert long copy {");
    Label L;
    __ cmpl(rax_elsize, LogBytesPerLong);
    __ jcc(Assembler::equal, L);
    __ stop("must be long copy, but elsize is wrong");
    __ bind(L);
    BLOCK_COMMENT("} assert long copy done");
  }
#endif
  __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(long_copy_entry));

  // ObjArrayKlass
__ BIND(L_objArray);
  // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]

  Label L_plain_copy, L_checkcast_copy;
  //  test array classes for subtyping
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax); // usual case is exact equality
  __ jcc(Assembler::notEqual, L_checkcast_copy);

  // Identically typed arrays can be copied without element-wise checks.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  __ lea(from, Address(src, src_pos, TIMES_OOP,
               arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
               arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  __ movl2ptr(count, r11_length); // length
__ BIND(L_plain_copy);
#ifdef _WIN64
  __ pop_ppx(rklass_tmp); // Restore callee-save rdi
#endif
  __ jump(RuntimeAddress(oop_copy_entry));

__ BIND(L_checkcast_copy);
  // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
  {
    // Before looking at dst.length, make sure dst is also an objArray.
    // This check also fails for flat arrays which are not supported.
    __ cmpl(Address(rax, lh_offset), objArray_lh);
    __ jcc(Assembler::notEqual, L_failed);

#ifdef ASSERT
    {
      BLOCK_COMMENT("assert not null-free array {");
      Label L;
      __ test_non_null_free_array_oop(dst, rklass_tmp, L);
      __ stop("unexpected null-free array");
      __ bind(L);
      BLOCK_COMMENT("} assert not null-free array");
    }
#endif

    // It is safe to examine both src.length and dst.length.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           rax, L_failed);

    // r11 is repurposed: length was consumed by the range checks above.
    const Register r11_dst_klass = r11;
    __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload

    // Marshal the base address arguments now, freeing registers.
    __ lea(from, Address(src, src_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ movl(count, length);           // length (reloaded)
    Register sco_temp = c_rarg3;      // this register is free now
    assert_different_registers(from, to, count, sco_temp,
                               r11_dst_klass, r10_src_klass);
    assert_clean_int(count, sco_temp);

    // Generate the type check.
    // If src is a subtype of dst's element klass, no per-element checks are
    // needed and we can use the plain oop copy.
    const int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);
    generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);

    // Fetch destination element klass from the ObjArrayKlass header.
    int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
    __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
    __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);

#ifdef _WIN64
    __ pop_ppx(rklass_tmp); // Restore callee-save rdi
#endif

    // the checkcast_copy loop needs two extra arguments:
    assert(c_rarg3 == sco_temp, "#3 already in place");
    // Set up arguments for checkcast_copy_entry.
    setup_arg_regs_using_thread(4);
    __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
    __ jump(RuntimeAddress(checkcast_copy_entry));
  }

__ BIND(L_failed);
#ifdef _WIN64
  __ pop_ppx(rklass_tmp); // Restore callee-save rdi
#endif
  // Return -1 (xor + not is 2 bytes shorter than mov rax, -1).
  __ xorptr(rax, rax);
  __ notptr(rax); // return -1
  __ leave();   // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
3283 
3284 #undef __