/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/objArrayKlass.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "stubGenerator_x86_64.hpp"
#ifdef COMPILER2
#include "opto/c2_globals.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmci_globals.hpp"
#endif

#define __ _masm->

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef PRODUCT
#define INC_COUNTER_NP(counter, rscratch) ((void)0)
#else
#define INC_COUNTER_NP(counter, rscratch) \
BLOCK_COMMENT("inc_counter " #counter); \
inc_counter_np(_masm, counter, rscratch);

static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  __ incrementl(ExternalAddress((address)&counter), rscratch);
}

#if COMPILER2_OR_JVMCI
static uint& get_profile_ctr(int shift) {
  if (shift == 0) {
    return SharedRuntime::_jbyte_array_copy_ctr;
  } else if (shift == 1) {
    return SharedRuntime::_jshort_array_copy_ctr;
  } else if (shift == 2) {
    return SharedRuntime::_jint_array_copy_ctr;
  } else {
    assert(shift == 3, "");
    return SharedRuntime::_jlong_array_copy_ctr;
  }
}
#endif // COMPILER2_OR_JVMCI
#endif // !PRODUCT

void StubGenerator::generate_arraycopy_stubs() {
  // Some copy stubs publish a normal entry and then a second 'fallback'
  // entry immediately following their stack push. This can be used
  // as a post-push branch target by compatible stubs when they
  // identify a special case that can be handled by the fallback
  // stub, e.g. a disjoint copy stub may be used as a special-case
  // fallback for its compatible conjoint copy stub.
  //
  // A no-push entry is always returned in the following local and
  // then published by assigning to the appropriate entry field in
  // class StubRoutines. The entry value is then passed to the
  // generator for the compatible stub. That means the entry must be
  // listed when saving to/restoring from the AOT cache, ensuring
  // that the inter-stub jumps are noted at AOT-cache save and
  // relocated at AOT-cache load.
  address nopush_entry;

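  // The hand-off pattern repeated below, in schematic form (illustrative
  // comment only; 'foo' stands in for a concrete element type):
  //   StubRoutines::_foo_disjoint_arraycopy        = generate_disjoint_foo_copy(&nopush_entry);
  //   StubRoutines::_foo_disjoint_arraycopy_nopush = nopush_entry;            // publish
  //   StubRoutines::_foo_arraycopy                 = generate_conjoint_foo_copy(
  //       StubRoutines::_foo_disjoint_arraycopy_nopush, &nopush_entry);       // consume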
  StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(&nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
  StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jbyte_arraycopy_nopush    = nopush_entry;

  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jshort_arraycopy_nopush   = nopush_entry;

  StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jint_arraycopy_nopush     = nopush_entry;

  StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jlong_disjoint_arraycopy_nopush  = nopush_entry;
  StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jlong_arraycopy_nopush    = nopush_entry;

  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush    = nopush_entry;
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush  = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  } else {
    StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush    = nopush_entry;
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush  = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  }

  StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
  // checkcast nopush entry is needed by generic copy
  StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
  // note that we don't need a returned nopush entry because the
  // generic copy does not cater for uninit arrays.
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);

  StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                            StubRoutines::_jshort_arraycopy_nopush,
                                                            StubRoutines::_jint_arraycopy_nopush,
                                                            StubRoutines::_jlong_arraycopy_nopush);
  StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                             StubRoutines::_jshort_arraycopy_nopush,
                                                             StubRoutines::_jint_arraycopy_nopush,
                                                             StubRoutines::_oop_arraycopy_nopush,
                                                             StubRoutines::_jlong_arraycopy_nopush,
                                                             StubRoutines::_checkcast_arraycopy_nopush);

  StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
  StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
  StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
  StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
  StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
  StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);

  StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);

  // We don't generate specialized code for HeapWord-aligned source
  // arrays, so just use the code we've already generated
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

  StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

  StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
  StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

  StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
  StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
}


// Verify that a register contains a clean 32-bit positive value
// (high 32 bits are 0) so it can be used in 64-bit shifts.
//
//  Input:
//    Rint  -  32-bit value
//    Rtmp  -  scratch
//
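// In C terms the check below is (sketch): a register value v is clean
// iff (int64_t)(int32_t)v == v, i.e. the register already holds the
// sign-extension of its low 32 bits; for the non-negative values
// expected here that means the high 32 bits are all zero.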
void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
  Label L;
  assert_different_registers(Rtmp, Rint);
  __ movslq(Rtmp, Rint);
  __ cmpq(Rtmp, Rint);
  __ jcc(Assembler::equal, L);
  __ stop("high 32-bits of int value are not 0");
  __ bind(L);
#endif
}


//  Generate overlap test for array copy stubs
//
//  Input:
//     c_rarg0 - from
//     c_rarg1 - to
//     c_rarg2 - element count
//
//  Output:
//     rax   - &from[element count], i.e. the end of the source array
//
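// In C terms the test branches to the no-overlap target when a forward
// (disjoint-style) copy is safe; sketch, with size == 1 << sf and
// unsigned pointer compares:
//   if (to <= from || to >= from + count * size) goto no_overlap;
// otherwise it falls through so the caller can take the conjoint
// (backward-copy) path.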
void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
  const Register from     = c_rarg0;
  const Register to       = c_rarg1;
  const Register count    = c_rarg2;
  const Register end_from = rax;

  __ cmpptr(to, from);
  __ lea(end_from, Address(from, count, sf, 0));
  if (NOLp == nullptr) {
    RuntimeAddress no_overlap(no_overlap_target);
    __ jump_cc(Assembler::belowEqual, no_overlap);
    __ cmpptr(to, end_from);
    __ jump_cc(Assembler::aboveEqual, no_overlap);
  } else {
    __ jcc(Assembler::belowEqual, (*NOLp));
    __ cmpptr(to, end_from);
    __ jcc(Assembler::aboveEqual, (*NOLp));
  }
}


// Copy big chunks forward
//
// Inputs:
//   end_from     - source array end address
//   end_to       - destination array end address
//   qword_count  - 64-bit element count, negative
//   tmp1, tmp2   - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes - exit label
//
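// On entry qword_count is negative and counts up toward zero, so the loop
// walks the arrays from low to high addresses relative to the end
// pointers, 32 or 64 bytes per iteration depending on
// UseUnalignedLoadStores; schematically, for the 64-byte case (sketch
// only, not the exact unrolling):
//   for (ptrdiff_t q = -(n - 8); q <= 0; q += 8)
//     copy qwords [q-7, q] at end_from/end_to + q*8;
// Any remaining qwords fall to the trailing copy at L_copy_8_bytes.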
void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
                                       Register qword_count, Register tmp1,
                                       Register tmp2, Label& L_copy_bytes,
                                       Label& L_copy_8_bytes, DecoratorSet decorators,
                                       BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);

      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -40),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -40), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 8);
    __ jcc(Assembler::lessEqual, L_loop);
    __ subptr(qword_count, 4);  // sub(8) and add(4)
    __ jcc(Assembler::greater, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ addptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32 bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::lessEqual, L_loop);
  }
  __ subptr(qword_count, 4);
  __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
}


// Copy big chunks backward
//
// Inputs:
//   from         - source array address
//   dest         - destination array address
//   qword_count  - 64-bit element count
//   tmp1, tmp2   - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes - exit label
//
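// Here qword_count is positive and counts down toward zero, so the loop
// walks the arrays from high to low addresses, 32 or 64 bytes per
// iteration depending on UseUnalignedLoadStores; schematically, for the
// 64-byte case (sketch only, not the exact unrolling):
//   for (ptrdiff_t q = n - 8; q >= 0; q -= 8)
//     copy qwords [q, q+7] at from/dest + q*8;   // highest chunk first
// Any remaining qwords fall to the trailing copy at L_copy_8_bytes.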
void StubGenerator::copy_bytes_backward(Register from, Register dest,
                                        Register qword_count, Register tmp1,
                                        Register tmp2, Label& L_copy_bytes,
                                        Label& L_copy_8_bytes, DecoratorSet decorators,
                                        BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 48),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 48), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_loop);

    __ addptr(qword_count, 4);  // add(8) and sub(4)
    __ jcc(Assembler::less, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ subptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32 bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::greaterEqual, L_loop);
  }
  __ addptr(qword_count, 4);
  __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
}

#if COMPILER2_OR_JVMCI

// Note: The following rules apply to the AVX3 optimized arraycopy stubs:
// - If the target supports the AVX3 features (BW+VL+F) then the implementation
//   uses 32 byte vectors (YMMs) for both the special cases (various small block
//   sizes) and the aligned copy loop. This is the default configuration.
// - If the copy length is above AVX3Threshold, then the implementation uses
//   64 byte vectors (ZMMs) for the main copy loop (and the subsequent tail),
//   since the bulk of the cycles will be consumed there.
// - If the user forces MaxVectorSize=32 then, above 4096 bytes, REP MOVS is
//   seen to perform better for disjoint copies. For conjoint/backward copies,
//   vector-based copy performs better.
// - If the user sets AVX3Threshold=0, then the special cases for small block
//   sizes operate over 64 byte vector registers (ZMMs).
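//
// Rough shape of the resulting dispatch (illustrative comment only; the
// actual control flow below also runs alignment pre/post loops):
//   if (MaxVectorSize == 64 && byte_len >= large_threshold) -> non-temporal 256-byte loop
//   else if (avx3threshold != 0 && len >= threshold[shift]) -> 64-byte (ZMM) or REP MOVS path
//   else                                                    -> 32-byte (YMM) loop / special cases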

// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// Side Effects:
//   *entry is set to the no-overlap (post-push) entry point, which is
//   used by generate_conjoint_[byte/short/int/long]_copy().
//
address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  int shift;
  bool is_oop;
  bool dest_uninitialized;

  switch (stub_id) {
  case StubId::stubgen_jbyte_disjoint_arraycopy_id:
    shift = 0;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jshort_disjoint_arraycopy_id:
    shift = 1;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jint_disjoint_arraycopy_id:
    shift = 2;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jlong_disjoint_arraycopy_id:
    shift = 3;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
  const int large_threshold = 2621440; // 2.5 MB
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_copy_large, L_finish;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register temp1       = r8;
  const Register temp2       = r11;
  const Register temp3       = rax;
  const Register temp4       = rcx;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // Type(shift)           byte(0), short(1), int(2),   long(3)
    int loop_size[]        = { 192,     96,       48,      24};
    int threshold[]        = { 4096,    2048,     1024,    512};

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds the remaining count and temp4 holds the running count used
    // to compute the next address offset into to/from (temp4 * scale).
    __ mov64(temp4, 0);
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                 temp4, temp3, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if (MaxVectorSize == 64) {
      __ movq(temp2, temp1);
      __ shlq(temp2, shift);
      __ cmpq(temp2, large_threshold);
      __ jcc(Assembler::greaterEqual, L_copy_large);
    }
    if (avx3threshold != 0) {
      __ cmpq(count, threshold[shift]);
      if (MaxVectorSize == 64) {
        // Copy using 64 byte vectors.
        __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
      } else {
        assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offers a faster copy path.
        __ jcc(Assembler::greaterEqual, L_repmovs);
      }
    }

    if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      __ negptr(temp2);
      __ addq(temp2, 32);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
         copy64_avx(to, from, temp4, xmm1, false, shift, 0);
         copy64_avx(to, from, temp4, xmm1, false, shift, 64);
         copy64_avx(to, from, temp4, xmm1, false, shift, 128);
         __ addptr(temp4, loop_size[shift]);
         __ subq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop);

      __ addq(temp1, loop_size[shift]);

      // Tail loop.
      __ jmp(L_tail);

      __ BIND(L_repmovs);
        __ movq(temp2, temp1);
        // Swap to(RSI) and from(RDI) addresses to comply with REP MOVS semantics.
        __ movq(temp3, to);
        __ movq(to,  from);
        __ movq(from, temp3);
        // Save to/from for restoration post rep_mov.
        __ movq(temp1, to);
        __ movq(temp3, from);
        if (shift < 3) {
          __ shrq(temp2, 3 - shift);   // quad word count
        }
        __ movq(temp4, temp2);         // move quad word count into temp4(RCX).
        __ rep_mov();
        __ shlq(temp2, 3);             // convert quad words into byte count.
        if (shift) {
          __ shrq(temp2, shift);       // type-specific count.
        }
        // Restore original addresses in to/from.
        __ movq(to, temp3);
        __ movq(from, temp1);
        __ movq(temp4, temp2);
        __ movq(temp1, count);
        __ subq(temp1, temp2);         // trailing part (less than a quad word size).
        __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      __ negptr(temp2);
      __ addq(temp2, 64);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
         copy64_avx(to, from, temp4, xmm1, false, shift, 0, true);
         copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
         copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
         __ addptr(temp4, loop_size[shift]);
         __ subq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop_64bytes);

      __ addq(temp1, loop_size[shift]);
      // Zero length check.
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }

  __ BIND(L_finish);
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 contains specific values used
  // during the arraycopy epilogue, so re-initialize r11 here.
  if (is_oop) {
    __ movq(r11, shift == 3 ? count : to);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  if (MaxVectorSize == 64) {
    __ BIND(L_copy_large);
      UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
      arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
    __ jmp(L_finish);
  }
  return start;
}

void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                                         Register temp3, Register temp4, Register count,
                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                                         XMMRegister xmm4, int shift) {

  // Type(shift)           byte(0), short(1), int(2),   long(3)
  int loop_size[]        = { 256,     128,       64,      32};
  int threshold[]        = { 4096,    2048,     1024,    512};

  Label L_main_loop_large;
  Label L_tail_large;
  Label L_exit_large;
  Label L_entry_large;
  Label L_main_pre_loop_large;
  Label L_pre_main_post_large;

  assert(MaxVectorSize == 64, "vector length != 64");
  __ BIND(L_entry_large);

  __ BIND(L_pre_main_post_large);
  // Partial copy to make dst address 64 byte aligned.
  __ movq(temp2, to);
  __ andq(temp2, 63);
  __ jcc(Assembler::equal, L_main_pre_loop_large);

  __ negptr(temp2);
  __ addq(temp2, 64);
  if (shift) {
    __ shrq(temp2, shift);
  }
  __ movq(temp3, temp2);
  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
  __ movq(temp4, temp2);
  __ movq(temp1, count);
  __ subq(temp1, temp2);

  __ cmpq(temp1, loop_size[shift]);
  __ jcc(Assembler::less, L_tail_large);

  __ BIND(L_main_pre_loop_large);
  __ subq(temp1, loop_size[shift]);

  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
  __ align32();
  __ BIND(L_main_loop_large);
  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
  __ addptr(temp4, loop_size[shift]);
  __ subq(temp1, loop_size[shift]);
  __ jcc(Assembler::greater, L_main_loop_large);
  // fence needed because copy256_avx3 uses non-temporal stores
  __ sfence();

  __ addq(temp1, loop_size[shift]);
  // Zero length check.
  __ jcc(Assembler::lessEqual, L_exit_large);
  __ BIND(L_tail_large);
  // Tail handling using 64 byte [masked] vector copy operations.
  __ cmpq(temp1, 0);
  __ jcc(Assembler::lessEqual, L_exit_large);
  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, L_exit_large);
  __ BIND(L_exit_large);
}

// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, address* entry, address nooverlap_target) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  int shift;
  bool is_oop;
  bool dest_uninitialized;

  switch (stub_id) {
  case StubId::stubgen_jbyte_arraycopy_id:
    shift = 0;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jshort_arraycopy_id:
    shift = 1;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jint_arraycopy_id:
    shift = 2;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jlong_arraycopy_id:
    shift = 3;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_uninit_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);

  Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register temp1       = r8;
  const Register temp2       = rcx;
  const Register temp3       = r11;
  const Register temp4       = rax;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));

  BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
  {
    // Type(shift)       byte(0), short(1), int(2),   long(3)
    int loop_size[]   = { 192,     96,       48,      24};
    int threshold[]   = { 4096,    2048,     1024,    512};

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count.
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    __ mov64(temp2, 0);
    __ movq(temp3, temp1);
    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                          temp4, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
      __ cmpq(temp1, threshold[shift]);
      __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
    }

    if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
         copy64_avx(to, from, temp1, xmm1, true, shift, -64);
         copy64_avx(to, from, temp1, xmm1, true, shift, -128);
         copy64_avx(to, from, temp1, xmm1, true, shift, -192);
         __ subptr(temp1, loop_size[shift]);
         __ cmpq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop);

      // Tail loop.
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0, true);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
         copy64_avx(to, from, temp1, xmm1, true, shift, -64, true);
         copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
         copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
         __ subq(temp1, loop_size[shift]);
         __ cmpq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop_64bytes);

      // Zero length check.
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      __ mov64(temp2, 0);
      __ movq(temp3, temp1);
      arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                            temp4, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 contains specific values used
  // during the arraycopy epilogue, so re-initialize r11 here.
  if (is_oop) {
    __ movq(r11, count);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}

void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                 Register to, Register count, int shift,
                                                 Register index, Register temp,
                                                 bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  int size_mat[][6] = {
  /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
  /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
  /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
  /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
  };
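  // i.e. size_mat[shift][i] == (32 * (i + 1)) >> shift: the element count
  // that fits in (i + 1) 32-byte blocks for the given element width.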

  // Case A) Special case for length less than or equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  __ jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  __ jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  __ jmp(L_exit);
}

void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                                     Register to, Register count, int shift, Register index,
                                                     Register temp, Label& L_exit) {
  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;

  int size_mat[][4] = {
  /* T_BYTE */ {64, 128, 192, 256},
  /* T_SHORT*/ {32, 64 , 96 , 128},
  /* T_INT  */ {16, 32 , 48 ,  64},
  /* T_LONG */ { 8, 16 , 24 ,  32}
  };
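  // i.e. size_mat[shift][i] == (64 * (i + 1)) >> shift: the element count
  // that fits in (i + 1) 64-byte blocks for the given element width.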

  assert(MaxVectorSize == 64, "vector length != 64");
  // Case A) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  __ subq(count, 64 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][2]);
  __ jcc(Assembler::greater, L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  __ subq(count, 128 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 256 bytes.
  __ BIND(L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  copy64_avx(to, from, index, xmm, false, shift, 128, true);
  __ subq(count, 192 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
  __ jmp(L_exit);
}
1195 
1196 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
1197                                                            Register to, Register start_index, Register end_index,
1198                                                            Register count, int shift, Register temp,
1199                                                            bool use64byteVector, Label& L_entry, Label& L_exit) {
1200   Label L_entry_64, L_entry_96, L_entry_128;
1201   Label L_entry_160, L_entry_192;
1202   bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
1203 
1204   int size_mat[][6] = {
1205   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
1206   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
1207   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
1208   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
1209   };
1210 
1211   // Case A) Special case for length less than equal to 32 bytes.
1212   __ cmpq(count, size_mat[shift][0]);
1213   __ jccb(Assembler::greater, L_entry_64);
1214   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1215   __ jmp(L_exit);
1216 
1217   // Case B) Special case for length less than equal to 64 bytes.
1218   __ BIND(L_entry_64);
1219   __ cmpq(count, size_mat[shift][1]);
1220   __ jccb(Assembler::greater, L_entry_96);
1221   if (avx3) {
1222      copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
1223   } else {
1224      copy32_avx(to, from, end_index, xmm, shift, -32);
1225      __ subq(count, 32 >> shift);
1226      copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1227   }
1228   __ jmp(L_exit);
1229 
1230   // Case C) Special case for length less than equal to 96 bytes.
1231   __ BIND(L_entry_96);
1232   __ cmpq(count, size_mat[shift][2]);
1233   __ jccb(Assembler::greater, L_entry_128);
1234   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1235   __ subq(count, 64 >> shift);
1236   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1237   __ jmp(L_exit);
1238 
1239   // Case D) Special case for length less than or equal to 128 bytes.
1240   __ BIND(L_entry_128);
1241   __ cmpq(count, size_mat[shift][3]);
1242   __ jccb(Assembler::greater, L_entry_160);
1243   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1244   copy32_avx(to, from, end_index, xmm, shift, -96);
1245   __ subq(count, 96 >> shift);
1246   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1247   __ jmp(L_exit);
1248 
1249   // Case E) Special case for length less than or equal to 160 bytes.
1250   __ BIND(L_entry_160);
1251   __ cmpq(count, size_mat[shift][4]);
1252   __ jccb(Assembler::greater, L_entry_192);
1253   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1254   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1255   __ subq(count, 128 >> shift);
1256   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1257   __ jmp(L_exit);
1258 
1259   // Case F) Special case for length less than or equal to 192 bytes.
1260   __ BIND(L_entry_192);
1261   __ cmpq(count, size_mat[shift][5]);
1262   __ jcc(Assembler::greater, L_entry);
1263   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1264   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1265   copy32_avx(to, from, end_index, xmm, shift, -160);
1266   __ subq(count, 160 >> shift);
1267   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1268   __ jmp(L_exit);
1269 }
1270 
1271 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1272                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1273                                 int shift, int offset) {
1274   if (MaxVectorSize == 64) {
1275     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
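         // prefetcht0 pulls the 256-byte blocks two and four iterations ahead
         // (each call copies 0x100 bytes) toward the caches before the loads
         // below touch them.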
1276     __ prefetcht0(Address(src, index, scale, offset + 0x200));
1277     __ prefetcht0(Address(src, index, scale, offset + 0x240));
1278     __ prefetcht0(Address(src, index, scale, offset + 0x280));
1279     __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1280 
1281     __ prefetcht0(Address(src, index, scale, offset + 0x400));
1282     __ prefetcht0(Address(src, index, scale, offset + 0x440));
1283     __ prefetcht0(Address(src, index, scale, offset + 0x480));
1284     __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1285 
1286     __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1287     __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1288     __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1289     __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1290 
1291     __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1292     __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1293     __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1294     __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1295   }
1296 }
1297 
1298 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1299                                        KRegister mask, Register length, Register index,
1300                                        Register temp, int shift, int offset,
1301                                        bool use64byteVector) {
1302   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1303   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1304   if (!use64byteVector) {
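         // No 64-byte vectors available: do a full 32-byte copy, then a
         // masked copy of the remaining (length - (32 >> shift)) elements.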
1305     copy32_avx(dst, src, index, xmm, shift, offset);
1306     __ subptr(length, 32 >> shift);
1307     copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1308   } else {
1309     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1310     assert(MaxVectorSize == 64, "vector length != 64");
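         // Build a k-mask with the low 'length' bits set: bzhiq clears every
         // bit of -1 at position >= length. E.g. (illustrative) length == 5
         // gives temp == 0x1F, so the masked moves below transfer exactly
         // 5 elements.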
1311     __ mov64(temp, -1L);
1312     __ bzhiq(temp, temp, length);
1313     __ kmovql(mask, temp);
1314     __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1315     __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1316   }
1317 }
1318 
1319 
1320 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1321                                        KRegister mask, Register length, Register index,
1322                                        Register temp, int shift, int offset) {
1323   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1324   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1325   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
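       // Same bzhiq-built mask idiom as copy64_masked_avx, applied with
       // 256-bit masked moves; 'length' is a residual element count that
       // fits in one 32-byte vector.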
1326   __ mov64(temp, -1L);
1327   __ bzhiq(temp, temp, length);
1328   __ kmovql(mask, temp);
1329   __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1330   __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1331 }
1332 
1333 
1334 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1335                                 int shift, int offset) {
1336   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1337   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1338   __ vmovdqu(xmm, Address(src, index, scale, offset));
1339   __ vmovdqu(Address(dst, index, scale, offset), xmm);
1340 }
1341 
1342 
1343 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1344                                 bool conjoint, int shift, int offset, bool use64byteVector) {
1345   assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1346   if (!use64byteVector) {
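         // Conjoint copies run from high to low addresses, so the upper
         // 32-byte half is copied before the lower one; an overlapping
         // destination then cannot clobber source bytes still to be read.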
1347     if (conjoint) {
1348       copy32_avx(dst, src, index, xmm, shift, offset+32);
1349       copy32_avx(dst, src, index, xmm, shift, offset);
1350     } else {
1351       copy32_avx(dst, src, index, xmm, shift, offset);
1352       copy32_avx(dst, src, index, xmm, shift, offset+32);
1353     }
1354   } else {
1355     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1356     __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1357     __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1358   }
1359 }
1360 
1361 #endif // COMPILER2_OR_JVMCI
1362 
1363 
1364 // Arguments:
1365 //   entry     - location for return of (post-push) entry
1366 //
1367 // Inputs:
1368 //   c_rarg0   - source array address
1369 //   c_rarg1   - destination array address
1370 //   c_rarg2   - element count, treated as ssize_t, can be zero
1371 //
1372 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1373 // we let the hardware handle it.  The one to eight bytes within words,
1374 // dwords or qwords that span cache line boundaries will still be loaded
1375 // and stored atomically.
1376 //
1377 // Side Effects:
1378 //   entry is set to the no-overlap entry point
1379 //   used by generate_conjoint_byte_copy().
1380 //
1381 address StubGenerator::generate_disjoint_byte_copy(address* entry) {
1382   StubId stub_id = StubId::stubgen_jbyte_disjoint_arraycopy_id;
1383   // aligned is always false -- x86_64 always uses the unaligned code
1384   const bool aligned = false;
1385 #if COMPILER2_OR_JVMCI
1386   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1387     return generate_disjoint_copy_avx3_masked(stub_id, entry);
1388   }
1389 #endif
1390   __ align(CodeEntryAlignment);
1391   StubCodeMark mark(this, stub_id);
1392   address start = __ pc();
1393   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1394 
1395   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1396   Label L_copy_byte, L_exit;
1397   const Register from        = rdi;  // source array address
1398   const Register to          = rsi;  // destination array address
1399   const Register count       = rdx;  // elements count
1400   const Register byte_count  = rcx;
1401   const Register qword_count = count;
1402   const Register end_from    = from; // source array end address
1403   const Register end_to      = to;   // destination array end address
1404   // End pointers are inclusive, and if count is not zero they point
1405   // to the last unit copied:  end_to[0] := end_from[0]
1406 
1407   __ enter(); // required for proper stackwalking of RuntimeStub frame
1408   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1409 
1410   if (entry != nullptr) {
1411     *entry = __ pc();
1412     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1413     BLOCK_COMMENT("Entry:");
1414   }
1415 
1416   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1417                     // r9 and r10 may be used to save non-volatile registers
1418 
1419   {
1420     // UnsafeMemoryAccess page error: continue after unsafe access
1421     UnsafeMemoryAccessMark umam(this, !aligned, true);
1422     // 'from', 'to' and 'count' are now valid
1423     __ movptr(byte_count, count);
1424     __ shrptr(count, 3); // count => qword_count
1425 
1426     // Copy from low to high addresses.  Use 'to' as scratch.
1427     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1428     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1429     __ negptr(qword_count); // make the count negative
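         // With qword_count negative, Address(end_from, qword_count, Address::times_8, 8)
         // starts at the first qword, and the loop below simply increments the
         // count toward zero instead of tracking a separate limit.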
1430     __ jmp(L_copy_bytes);
1431 
1432     // Copy trailing qwords
1433   __ BIND(L_copy_8_bytes);
1434     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1435     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1436     __ increment(qword_count);
1437     __ jcc(Assembler::notZero, L_copy_8_bytes);
1438 
1439     // Check for and copy trailing dword
1440   __ BIND(L_copy_4_bytes);
1441     __ testl(byte_count, 4);
1442     __ jccb(Assembler::zero, L_copy_2_bytes);
1443     __ movl(rax, Address(end_from, 8));
1444     __ movl(Address(end_to, 8), rax);
1445 
1446     __ addptr(end_from, 4);
1447     __ addptr(end_to, 4);
1448 
1449     // Check for and copy trailing word
1450   __ BIND(L_copy_2_bytes);
1451     __ testl(byte_count, 2);
1452     __ jccb(Assembler::zero, L_copy_byte);
1453     __ movw(rax, Address(end_from, 8));
1454     __ movw(Address(end_to, 8), rax);
1455 
1456     __ addptr(end_from, 2);
1457     __ addptr(end_to, 2);
1458 
1459     // Check for and copy trailing byte
1460   __ BIND(L_copy_byte);
1461     __ testl(byte_count, 1);
1462     __ jccb(Assembler::zero, L_exit);
1463     __ movb(rax, Address(end_from, 8));
1464     __ movb(Address(end_to, 8), rax);
1465   }
1466 __ BIND(L_exit);
1467   address ucme_exit_pc = __ pc();
1468   restore_arg_regs();
1469   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1470   __ xorptr(rax, rax); // return 0
1471   __ vzeroupper();
1472   __ leave(); // required for proper stackwalking of RuntimeStub frame
1473   __ ret(0);
1474 
1475   {
1476     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1477     // Copy in multi-byte chunks
1478     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1479     __ jmp(L_copy_4_bytes);
1480   }
1481   return start;
1482 }
1483 
1484 
1485 // Arguments:
1486 //   entry     - location for return of (post-push) entry
1487 //   nooverlap_target - entry to branch to if no overlap detected
1488 //
1489 // Inputs:
1490 //   c_rarg0   - source array address
1491 //   c_rarg1   - destination array address
1492 //   c_rarg2   - element count, treated as ssize_t, can be zero
1493 //
1494 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1495 // we let the hardware handle it.  The one to eight bytes within words,
1496 // dwords or qwords that span cache line boundaries will still be loaded
1497 // and stored atomically.
1498 //
1499 address StubGenerator::generate_conjoint_byte_copy(address nooverlap_target, address* entry) {
1500   StubId stub_id = StubId::stubgen_jbyte_arraycopy_id;
1501   // aligned is always false -- x86_64 always uses the unaligned code
1502   const bool aligned = false;
1503 #if COMPILER2_OR_JVMCI
1504   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1505     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
1506   }
1507 #endif
1508   __ align(CodeEntryAlignment);
1509   StubCodeMark mark(this, stub_id);
1510   address start = __ pc();
1511   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1512 
1513   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1514   const Register from        = rdi;  // source array address
1515   const Register to          = rsi;  // destination array address
1516   const Register count       = rdx;  // elements count
1517   const Register byte_count  = rcx;
1518   const Register qword_count = count;
1519 
1520   __ enter(); // required for proper stackwalking of RuntimeStub frame
1521   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1522 
1523   if (entry != nullptr) {
1524     *entry = __ pc();
1525     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1526     BLOCK_COMMENT("Entry:");
1527   }
1528 
1529   array_overlap_test(nooverlap_target, Address::times_1);
1530   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1531                     // r9 and r10 may be used to save non-volatile registers
1532 
1533   {
1534     // UnsafeMemoryAccess page error: continue after unsafe access
1535     UnsafeMemoryAccessMark umam(this, !aligned, true);
1536     // 'from', 'to' and 'count' are now valid
1537     __ movptr(byte_count, count);
1538     __ shrptr(count, 3);   // count => qword_count
1539 
1540     // Copy from high to low addresses.
1541 
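         // The sub-qword tail (byte, then word, then dword) is copied first,
         // at the highest addresses, so the descending qword loop never
         // overwrites source bytes an overlapping destination still needs.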
1542     // Check for and copy trailing byte
1543     __ testl(byte_count, 1);
1544     __ jcc(Assembler::zero, L_copy_2_bytes);
1545     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1546     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1547     __ decrement(byte_count); // Adjust for possible trailing word
1548 
1549     // Check for and copy trailing word
1550   __ BIND(L_copy_2_bytes);
1551     __ testl(byte_count, 2);
1552     __ jcc(Assembler::zero, L_copy_4_bytes);
1553     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1554     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1555 
1556     // Check for and copy trailing dword
1557   __ BIND(L_copy_4_bytes);
1558     __ testl(byte_count, 4);
1559     __ jcc(Assembler::zero, L_copy_bytes);
1560     __ movl(rax, Address(from, qword_count, Address::times_8));
1561     __ movl(Address(to, qword_count, Address::times_8), rax);
1562     __ jmp(L_copy_bytes);
1563 
1564     // Copy trailing qwords
1565   __ BIND(L_copy_8_bytes);
1566     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1567     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1568     __ decrement(qword_count);
1569     __ jcc(Assembler::notZero, L_copy_8_bytes);
1570   }
1571   restore_arg_regs();
1572   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1573   __ xorptr(rax, rax); // return 0
1574   __ vzeroupper();
1575   __ leave(); // required for proper stackwalking of RuntimeStub frame
1576   __ ret(0);
1577 
1578   {
1579     // UnsafeMemoryAccess page error: continue after unsafe access
1580     UnsafeMemoryAccessMark umam(this, !aligned, true);
1581     // Copy in multi-byte chunks
1582     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1583   }
1584   restore_arg_regs();
1585   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1586   __ xorptr(rax, rax); // return 0
1587   __ vzeroupper();
1588   __ leave(); // required for proper stackwalking of RuntimeStub frame
1589   __ ret(0);
1590 
1591   return start;
1592 }
1593 
1594 
1595 // Arguments:
1596 //   entry     - location for return of (post-push) entry
1597 //
1598 // Inputs:
1599 //   c_rarg0   - source array address
1600 //   c_rarg1   - destination array address
1601 //   c_rarg2   - element count, treated as ssize_t, can be zero
1602 //
1603 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1604 // let the hardware handle it.  The two or four words within dwords
1605 // or qwords that span cache line boundaries will still be loaded
1606 // and stored atomically.
1607 //
1608 // Side Effects:
1609 //   entry is set to the no-overlap entry point
1610 //   used by generate_conjoint_short_copy().
1611 //
1612 address StubGenerator::generate_disjoint_short_copy(address *entry) {
1613   StubId stub_id = StubId::stubgen_jshort_disjoint_arraycopy_id;
1614   // aligned is always false -- x86_64 always uses the unaligned code
1615   const bool aligned = false;
1616 #if COMPILER2_OR_JVMCI
1617   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1618     return generate_disjoint_copy_avx3_masked(stub_id, entry);
1619   }
1620 #endif
1621 
1622   __ align(CodeEntryAlignment);
1623   StubCodeMark mark(this, stub_id);
1624   address start = __ pc();
1625   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1626 
1627   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1628   const Register from        = rdi;  // source array address
1629   const Register to          = rsi;  // destination array address
1630   const Register count       = rdx;  // elements count
1631   const Register word_count  = rcx;
1632   const Register qword_count = count;
1633   const Register end_from    = from; // source array end address
1634   const Register end_to      = to;   // destination array end address
1635   // End pointers are inclusive, and if count is not zero they point
1636   // to the last unit copied:  end_to[0] := end_from[0]
1637 
1638   __ enter(); // required for proper stackwalking of RuntimeStub frame
1639   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1640 
1641   if (entry != nullptr) {
1642     *entry = __ pc();
1643     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1644     BLOCK_COMMENT("Entry:");
1645   }
1646 
1647   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1648                     // r9 and r10 may be used to save non-volatile registers
1649 
1650   {
1651     // UnsafeMemoryAccess page error: continue after unsafe access
1652     UnsafeMemoryAccessMark umam(this, !aligned, true);
1653     // 'from', 'to' and 'count' are now valid
1654     __ movptr(word_count, count);
1655     __ shrptr(count, 2); // count => qword_count
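         // e.g. (illustrative) count == 15 shorts: qword_count == 3 (24 bytes),
         // then word_count & 2 selects one trailing dword (2 shorts) and
         // word_count & 1 one trailing short.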
1656 
1657     // Copy from low to high addresses.  Use 'to' as scratch.
1658     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1659     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1660     __ negptr(qword_count);
1661     __ jmp(L_copy_bytes);
1662 
1663     // Copy trailing qwords
1664   __ BIND(L_copy_8_bytes);
1665     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1666     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1667     __ increment(qword_count);
1668     __ jcc(Assembler::notZero, L_copy_8_bytes);
1669 
1670     // Original 'dest' is trashed, so we can't use it as a
1671     // base register for a possible trailing word copy
1672 
1673     // Check for and copy trailing dword
1674   __ BIND(L_copy_4_bytes);
1675     __ testl(word_count, 2);
1676     __ jccb(Assembler::zero, L_copy_2_bytes);
1677     __ movl(rax, Address(end_from, 8));
1678     __ movl(Address(end_to, 8), rax);
1679 
1680     __ addptr(end_from, 4);
1681     __ addptr(end_to, 4);
1682 
1683     // Check for and copy trailing word
1684   __ BIND(L_copy_2_bytes);
1685     __ testl(word_count, 1);
1686     __ jccb(Assembler::zero, L_exit);
1687     __ movw(rax, Address(end_from, 8));
1688     __ movw(Address(end_to, 8), rax);
1689   }
1690 __ BIND(L_exit);
1691   address ucme_exit_pc = __ pc();
1692   restore_arg_regs();
1693   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1694   __ xorptr(rax, rax); // return 0
1695   __ vzeroupper();
1696   __ leave(); // required for proper stackwalking of RuntimeStub frame
1697   __ ret(0);
1698 
1699   {
1700     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1701     // Copy in multi-byte chunks
1702     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1703     __ jmp(L_copy_4_bytes);
1704   }
1705 
1706   return start;
1707 }
1708 
1709 
1710 address StubGenerator::generate_fill(StubId stub_id) {
1711   BasicType t;
1712   bool aligned;
1713 
1714   switch (stub_id) {
1715   case StubId::stubgen_jbyte_fill_id:
1716     t = T_BYTE;
1717     aligned = false;
1718     break;
1719   case StubId::stubgen_jshort_fill_id:
1720     t = T_SHORT;
1721     aligned = false;
1722     break;
1723   case StubId::stubgen_jint_fill_id:
1724     t = T_INT;
1725     aligned = false;
1726     break;
1727   case StubId::stubgen_arrayof_jbyte_fill_id:
1728     t = T_BYTE;
1729     aligned = true;
1730     break;
1731   case StubId::stubgen_arrayof_jshort_fill_id:
1732     t = T_SHORT;
1733     aligned = true;
1734     break;
1735   case StubId::stubgen_arrayof_jint_fill_id:
1736     t = T_INT;
1737     aligned = true;
1738     break;
1739   default:
1740     ShouldNotReachHere();
1741   }
1742 
1743   __ align(CodeEntryAlignment);
1744   StubCodeMark mark(this, stub_id);
1745   address start = __ pc();
1746 
1747   BLOCK_COMMENT("Entry:");
1748 
1749   const Register to       = c_rarg0;  // destination array address
1750   const Register value    = c_rarg1;  // value
1751   const Register count    = c_rarg2;  // elements count
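       // Work on a copy of the count in r11; the fill helper below is free
       // to consume its count register destructively.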
1752   __ mov(r11, count);
1753 
1754   __ enter(); // required for proper stackwalking of RuntimeStub frame
1755 
1756   {
1757     // Add set memory mark to protect against unsafe accesses faulting
1758     UnsafeMemoryAccessMark umam(this, ((t == T_BYTE) && !aligned), true);
1759     __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1760   }
1761 
1762   __ vzeroupper();
1763   __ leave(); // required for proper stackwalking of RuntimeStub frame
1764   __ ret(0);
1765 
1766   return start;
1767 }
1768 
1769 
1770 // Arguments:
1771 //   entry     - location for return of (post-push) entry
1772 //   nooverlap_target - entry to branch to if no overlap detected
1773 //
1774 // Inputs:
1775 //   c_rarg0   - source array address
1776 //   c_rarg1   - destination array address
1777 //   c_rarg2   - element count, treated as ssize_t, can be zero
1778 //
1779 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1780 // let the hardware handle it.  The two or four words within dwords
1781 // or qwords that span cache line boundaries will still be loaded
1782 // and stored atomically.
1783 //
1784 address StubGenerator::generate_conjoint_short_copy(address nooverlap_target, address *entry) {
1785   StubId stub_id = StubId::stubgen_jshort_arraycopy_id;
1786   // aligned is always false -- x86_64 always uses the unaligned code
1787   const bool aligned = false;
1788 #if COMPILER2_OR_JVMCI
1789   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1790     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
1791   }
1792 #endif
1793 
1794   __ align(CodeEntryAlignment);
1795   StubCodeMark mark(this, stub_id);
1796   address start = __ pc();
1797   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1798 
1799   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1800   const Register from        = rdi;  // source array address
1801   const Register to          = rsi;  // destination array address
1802   const Register count       = rdx;  // elements count
1803   const Register word_count  = rcx;
1804   const Register qword_count = count;
1805 
1806   __ enter(); // required for proper stackwalking of RuntimeStub frame
1807   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1808 
1809   if (entry != nullptr) {
1810     *entry = __ pc();
1811     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1812     BLOCK_COMMENT("Entry:");
1813   }
1814 
1815   array_overlap_test(nooverlap_target, Address::times_2);
1816   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1817                     // r9 and r10 may be used to save non-volatile registers
1818 
1819   {
1820     // UnsafeMemoryAccess page error: continue after unsafe access
1821     UnsafeMemoryAccessMark umam(this, !aligned, true);
1822     // 'from', 'to' and 'count' are now valid
1823     __ movptr(word_count, count);
1824     __ shrptr(count, 2); // count => qword_count
1825 
1826     // Copy from high to low addresses.  Use 'to' as scratch.
1827 
1828     // Check for and copy trailing word
1829     __ testl(word_count, 1);
1830     __ jccb(Assembler::zero, L_copy_4_bytes);
1831     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1832     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1833 
1834     // Check for and copy trailing dword
1835   __ BIND(L_copy_4_bytes);
1836     __ testl(word_count, 2);
1837     __ jcc(Assembler::zero, L_copy_bytes);
1838     __ movl(rax, Address(from, qword_count, Address::times_8));
1839     __ movl(Address(to, qword_count, Address::times_8), rax);
1840     __ jmp(L_copy_bytes);
1841 
1842     // Copy trailing qwords
1843   __ BIND(L_copy_8_bytes);
1844     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1845     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1846     __ decrement(qword_count);
1847     __ jcc(Assembler::notZero, L_copy_8_bytes);
1848   }
1849   restore_arg_regs();
1850   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1851   __ xorptr(rax, rax); // return 0
1852   __ vzeroupper();
1853   __ leave(); // required for proper stackwalking of RuntimeStub frame
1854   __ ret(0);
1855 
1856   {
1857     // UnsafeMemoryAccess page error: continue after unsafe access
1858     UnsafeMemoryAccessMark umam(this, !aligned, true);
1859     // Copy in multi-byte chunks
1860     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1861   }
1862   restore_arg_regs();
1863   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1864   __ xorptr(rax, rax); // return 0
1865   __ vzeroupper();
1866   __ leave(); // required for proper stackwalking of RuntimeStub frame
1867   __ ret(0);
1868 
1869   return start;
1870 }
1871 
1872 
1873 // Arguments:
1874 //   stub_id   - unique id for the stub to generate
1875 //   entry     - location for return of (post-push) entry
1876 //   is_oop    - true => oop array, so generate store check code
1877 //
1878 // Inputs:
1879 //   c_rarg0   - source array address
1880 //   c_rarg1   - destination array address
1881 //   c_rarg2   - element count, treated as ssize_t, can be zero
1882 //
1883 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1884 // the hardware handle it.  The two dwords within qwords that span
1885 // cache line boundaries will still be loaded and stored atomically.
1886 //
1887 // Side Effects:
1888 //   disjoint_int_copy_entry is set to the no-overlap entry point
1889 //   used by generate_conjoint_int_oop_copy().
1890 //
1891 address StubGenerator::generate_disjoint_int_oop_copy(StubId stub_id, address* entry) {
1892   // aligned is always false -- x86_64 always uses the unaligned code
1893   const bool aligned = false;
1894   bool is_oop;
1895   bool dest_uninitialized;
1896   switch (stub_id) {
1897   case StubId::stubgen_jint_disjoint_arraycopy_id:
1898     is_oop = false;
1899     dest_uninitialized = false;
1900     break;
1901   case StubId::stubgen_oop_disjoint_arraycopy_id:
1902     assert(UseCompressedOops, "inconsistent oop copy size!");
1903     is_oop = true;
1904     dest_uninitialized = false;
1905     break;
1906   case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1907     assert(UseCompressedOops, "inconsistent oop copy size!");
1908     is_oop = true;
1909     dest_uninitialized = true;
1910     break;
1911   default:
1912     ShouldNotReachHere();
1913   }
1914 
1915   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1916 #if COMPILER2_OR_JVMCI
1917   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1918     return generate_disjoint_copy_avx3_masked(stub_id, entry);
1919   }
1920 #endif
1921 
1922   __ align(CodeEntryAlignment);
1923   StubCodeMark mark(this, stub_id);
1924   address start = __ pc();
1925 
1926   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1927   const Register from        = rdi;  // source array address
1928   const Register to          = rsi;  // destination array address
1929   const Register count       = rdx;  // elements count
1930   const Register dword_count = rcx;
1931   const Register qword_count = count;
1932   const Register end_from    = from; // source array end address
1933   const Register end_to      = to;   // destination array end address
1934   // End pointers are inclusive, and if count is not zero they point
1935   // to the last unit copied:  end_to[0] := end_from[0]
1936 
1937   __ enter(); // required for proper stackwalking of RuntimeStub frame
1938   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1939 
1940   if (entry != nullptr) {
1941     *entry = __ pc();
1942     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1943     BLOCK_COMMENT("Entry:");
1944   }
1945 
1946   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1947                                  // r9 is used to save r15_thread
1948 
1949   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1950   if (dest_uninitialized) {
1951     decorators |= IS_DEST_UNINITIALIZED;
1952   }
1953   if (aligned) {
1954     decorators |= ARRAYCOPY_ALIGNED;
1955   }
1956 
1957   BasicType type = is_oop ? T_OBJECT : T_INT;
1958   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1959 
1960   {
1961     // UnsafeMemoryAccess page error: continue after unsafe access
1962     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
1963     // 'from', 'to' and 'count' are now valid
1964     __ movptr(dword_count, count);
1965     __ shrptr(count, 1); // count => qword_count
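         // e.g. (illustrative) count == 7 ints: qword_count == 3 (six ints),
         // and dword_count & 1 selects the single trailing dword below.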
1966 
1967     // Copy from low to high addresses.  Use 'to' as scratch.
1968     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1969     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1970     __ negptr(qword_count);
1971     __ jmp(L_copy_bytes);
1972 
1973     // Copy trailing qwords
1974   __ BIND(L_copy_8_bytes);
1975     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1976     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1977     __ increment(qword_count);
1978     __ jcc(Assembler::notZero, L_copy_8_bytes);
1979 
1980     // Check for and copy trailing dword
1981   __ BIND(L_copy_4_bytes);
1982     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1983     __ jccb(Assembler::zero, L_exit);
1984     __ movl(rax, Address(end_from, 8));
1985     __ movl(Address(end_to, 8), rax);
1986   }
1987 __ BIND(L_exit);
1988   address ucme_exit_pc = __ pc();
1989   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1990   restore_arg_regs_using_thread();
1991   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1992   __ vzeroupper();
1993   __ xorptr(rax, rax); // return 0
1994   __ leave(); // required for proper stackwalking of RuntimeStub frame
1995   __ ret(0);
1996 
1997   {
1998     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
1999     // Copy in multi-byte chunks
2000     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2001     __ jmp(L_copy_4_bytes);
2002   }
2003 
2004   return start;
2005 }
2006 
2007 
2008 // Arguments:
2009 //   entry     - location for return of (post-push) entry
2010 //   nooverlap_target - entry to branch to if no overlap detected
2011 //   is_oop  - true => oop array, so generate store check code
2012 //
2013 // Inputs:
2014 //   c_rarg0   - source array address
2015 //   c_rarg1   - destination array address
2016 //   c_rarg2   - element count, treated as ssize_t, can be zero
2017 //
2018 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2019 // the hardware handle it.  The two dwords within qwords that span
2020 // cache line boundaries will still be loaded and stored atomically.
2021 //
2022 address StubGenerator::generate_conjoint_int_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2023   // aligned is always false -- x86_64 always uses the unaligned code
2024   const bool aligned = false;
2025   bool is_oop;
2026   bool dest_uninitialized;
2027   switch (stub_id) {
2028   case StubId::stubgen_jint_arraycopy_id:
2029     is_oop = false;
2030     dest_uninitialized = false;
2031     break;
2032   case StubId::stubgen_oop_arraycopy_id:
2033     assert(UseCompressedOops, "inconsistent oop copy size!");
2034     is_oop = true;
2035     dest_uninitialized = false;
2036     break;
2037   case StubId::stubgen_oop_arraycopy_uninit_id:
2038     assert(UseCompressedOops, "inconsistent oop copy size!");
2039     is_oop = true;
2040     dest_uninitialized = true;
2041     break;
2042   default:
2043     ShouldNotReachHere();
2044   }
2045 
2046   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2047 #if COMPILER2_OR_JVMCI
2048   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2049     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2050   }
2051 #endif
2052 
2053   __ align(CodeEntryAlignment);
2054   StubCodeMark mark(this, stub_id);
2055   address start = __ pc();
2056 
2057   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2058   const Register from        = rdi;  // source array address
2059   const Register to          = rsi;  // destination array address
2060   const Register count       = rdx;  // elements count
2061   const Register dword_count = rcx;
2062   const Register qword_count = count;
2063 
2064   __ enter(); // required for proper stackwalking of RuntimeStub frame
2065   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2066 
2067   if (entry != nullptr) {
2068     *entry = __ pc();
2069     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2070     BLOCK_COMMENT("Entry:");
2071   }
2072 
2073   array_overlap_test(nooverlap_target, Address::times_4);
2074   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2075                                  // r9 is used to save r15_thread
2076 
2077   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2078   if (dest_uninitialized) {
2079     decorators |= IS_DEST_UNINITIALIZED;
2080   }
2081   if (aligned) {
2082     decorators |= ARRAYCOPY_ALIGNED;
2083   }
2084 
2085   BasicType type = is_oop ? T_OBJECT : T_INT;
2086   // no registers are destroyed by this call
2087   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2088 
2089   assert_clean_int(count, rax); // Make sure 'count' is clean int.
2090   {
2091     // UnsafeMemoryAccess page error: continue after unsafe access
2092     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2093     // 'from', 'to' and 'count' are now valid
2094     __ movptr(dword_count, count);
2095     __ shrptr(count, 1); // count => qword_count
2096 
2097     // Copy from high to low addresses.  Use 'to' as scratch.
2098 
2099     // Check for and copy trailing dword
2100     __ testl(dword_count, 1);
2101     __ jcc(Assembler::zero, L_copy_bytes);
2102     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2103     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2104     __ jmp(L_copy_bytes);
2105 
2106     // Copy trailing qwords
2107   __ BIND(L_copy_8_bytes);
2108     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2109     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2110     __ decrement(qword_count);
2111     __ jcc(Assembler::notZero, L_copy_8_bytes);
2112   }
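       // Oop copies must still run the arraycopy epilogue (GC store
       // barriers), so they jump to the common exit instead of returning here.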
2113   if (is_oop) {
2114     __ jmp(L_exit);
2115   }
2116   restore_arg_regs_using_thread();
2117   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2118   __ xorptr(rax, rax); // return 0
2119   __ vzeroupper();
2120   __ leave(); // required for proper stackwalking of RuntimeStub frame
2121   __ ret(0);
2122 
2123   {
2124     // UnsafeMemoryAccess page error: continue after unsafe access
2125     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2126     // Copy in multi-byte chunks
2127     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2128   }
2129 
2130 __ BIND(L_exit);
2131   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2132   restore_arg_regs_using_thread();
2133   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2134   __ xorptr(rax, rax); // return 0
2135   __ vzeroupper();
2136   __ leave(); // required for proper stackwalking of RuntimeStub frame
2137   __ ret(0);
2138 
2139   return start;
2140 }
2141 
2142 
2143 // Arguments:
2144 //   entry     - location for return of (post-push) entry
2145 //
2146 // Inputs:
2147 //   c_rarg0   - source array address
2148 //   c_rarg1   - destination array address
2149 //   c_rarg2   - element count, treated as ssize_t, can be zero
2150 //
2151 // Side Effects:
2152 //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2153 //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2154 //
2155 address StubGenerator::generate_disjoint_long_oop_copy(StubId stub_id, address *entry) {
2156   // aligned is always false -- x86_64 always uses the unaligned code
2157   const bool aligned = false;
2158   bool is_oop;
2159   bool dest_uninitialized;
2160   switch (stub_id) {
2161   case StubId::stubgen_jlong_disjoint_arraycopy_id:
2162     is_oop = false;
2163     dest_uninitialized = false;
2164     break;
2165   case StubId::stubgen_oop_disjoint_arraycopy_id:
2166     assert(!UseCompressedOops, "inconsistent oop copy size!");
2167     is_oop = true;
2168     dest_uninitialized = false;
2169     break;
2170   case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2171     assert(!UseCompressedOops, "inconsistent oop copy size!");
2172     is_oop = true;
2173     dest_uninitialized = true;
2174     break;
2175   default:
2176     ShouldNotReachHere();
2177   }
2178 
2179   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2180 #if COMPILER2_OR_JVMCI
2181   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2182     return generate_disjoint_copy_avx3_masked(stub_id, entry);
2183   }
2184 #endif
2185 
2186   __ align(CodeEntryAlignment);
2187   StubCodeMark mark(this, stub_id);
2188   address start = __ pc();
2189 
2190   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2191   const Register from        = rdi;  // source array address
2192   const Register to          = rsi;  // destination array address
2193   const Register qword_count = rdx;  // elements count
2194   const Register end_from    = from; // source array end address
2195   const Register end_to      = rcx;  // destination array end address
2196   const Register saved_count = r11;
2197   // End pointers are inclusive, and if count is not zero they point
2198   // to the last unit copied:  end_to[0] := end_from[0]
2199 
2200   __ enter(); // required for proper stackwalking of RuntimeStub frame
2201   // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2202   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2203 
2204   if (entry != nullptr) {
2205     *entry = __ pc();
2206     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2207     BLOCK_COMMENT("Entry:");
2208   }
2209 
2210   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2211                                  // r9 is used to save r15_thread
2212   // 'from', 'to' and 'qword_count' are now valid
2213 
2214   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2215   if (dest_uninitialized) {
2216     decorators |= IS_DEST_UNINITIALIZED;
2217   }
2218   if (aligned) {
2219     decorators |= ARRAYCOPY_ALIGNED;
2220   }
2221 
2222   BasicType type = is_oop ? T_OBJECT : T_LONG;
2223   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2224   {
2225     // UnsafeMemoryAccess page error: continue after unsafe access
2226     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2227 
2228     // Copy from low to high addresses.  Use 'to' as scratch.
2229     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2230     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2231     __ negptr(qword_count);
2232     __ jmp(L_copy_bytes);
2233 
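         // Loads and stores go through the BarrierSetAssembler rather than
         // raw movq, so collectors that need per-element access barriers on
         // oop arrays (e.g. ZGC) remain correct.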
2234     // Copy trailing qwords
2235   __ BIND(L_copy_8_bytes);
2236     bs->copy_load_at(_masm, decorators, type, 8,
2237                      rax, Address(end_from, qword_count, Address::times_8, 8),
2238                      r10);
2239     bs->copy_store_at(_masm, decorators, type, 8,
2240                       Address(end_to, qword_count, Address::times_8, 8), rax,
2241                       r10);
2242     __ increment(qword_count);
2243     __ jcc(Assembler::notZero, L_copy_8_bytes);
2244   }
2245   if (is_oop) {
2246     __ jmp(L_exit);
2247   } else {
2248     restore_arg_regs_using_thread();
2249     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2250     __ xorptr(rax, rax); // return 0
2251     __ vzeroupper();
2252     __ leave(); // required for proper stackwalking of RuntimeStub frame
2253     __ ret(0);
2254   }
2255 
2256   {
2257     // UnsafeMemoryAccess page error: continue after unsafe access
2258     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2259     // Copy in multi-byte chunks
2260     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2261   }
2262 
2263   __ BIND(L_exit);
2264   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2265   restore_arg_regs_using_thread();
2266   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2267                           SharedRuntime::_jlong_array_copy_ctr,
2268                  rscratch1); // Update counter after rscratch1 is free
2269   __ vzeroupper();
2270   __ xorptr(rax, rax); // return 0
2271   __ leave(); // required for proper stackwalking of RuntimeStub frame
2272   __ ret(0);
2273 
2274   return start;
2275 }
2276 
2277 
2278 // Arguments:
2279 //   entry     - location for return of (post-push) entry
2280 //   nooverlap_target - entry to branch to if no overlap detected
2281 //   is_oop  - true => oop array, so generate store check code
2282 //
2283 // Inputs:
2284 //   c_rarg0   - source array address
2285 //   c_rarg1   - destination array address
2286 //   c_rarg2   - element count, treated as ssize_t, can be zero
2287 //
2288 address StubGenerator::generate_conjoint_long_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2289   // aligned is always false -- x86_64 always uses the unaligned code
2290   const bool aligned = false;
2291   bool is_oop;
2292   bool dest_uninitialized;
2293   switch (stub_id) {
2294   case StubId::stubgen_jlong_arraycopy_id:
2295     is_oop = false;
2296     dest_uninitialized = false;
2297     break;
2298   case StubId::stubgen_oop_arraycopy_id:
2299     assert(!UseCompressedOops, "inconsistent oop copy size!");
2300     is_oop = true;
2301     dest_uninitialized = false;
2302     break;
2303   case StubId::stubgen_oop_arraycopy_uninit_id:
2304     assert(!UseCompressedOops, "inconsistent oop copy size!");
2305     is_oop = true;
2306     dest_uninitialized = true;
2307     break;
2308   default:
2309     ShouldNotReachHere();
2310   }
2311 
2312   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2313 #if COMPILER2_OR_JVMCI
2314   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2315     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2316   }
2317 #endif
2318 
2319   __ align(CodeEntryAlignment);
2320   StubCodeMark mark(this, stub_id);
2321   address start = __ pc();
2322 
2323   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2324   const Register from        = rdi;  // source array address
2325   const Register to          = rsi;  // destination array address
2326   const Register qword_count = rdx;  // elements count
2327   const Register saved_count = rcx;
2328 
2329   __ enter(); // required for proper stackwalking of RuntimeStub frame
2330   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2331 
2332   if (entry != nullptr) {
2333     *entry = __ pc();
2334     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2335     BLOCK_COMMENT("Entry:");
2336   }
2337 
2338   array_overlap_test(nooverlap_target, Address::times_8);
2339   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2340                                  // r9 is used to save r15_thread
2341   // 'from', 'to' and 'qword_count' are now valid
2342 
2343   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2344   if (dest_uninitialized) {
2345     decorators |= IS_DEST_UNINITIALIZED;
2346   }
2347   if (aligned) {
2348     decorators |= ARRAYCOPY_ALIGNED;
2349   }
2350 
2351   BasicType type = is_oop ? T_OBJECT : T_LONG;
2352   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2353   {
2354     // UnsafeMemoryAccess page error: continue after unsafe access
2355     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2356 
2357     __ jmp(L_copy_bytes);
2358 
2359     // Copy trailing qwords
2360   __ BIND(L_copy_8_bytes);
2361     bs->copy_load_at(_masm, decorators, type, 8,
2362                      rax, Address(from, qword_count, Address::times_8, -8),
2363                      r10);
2364     bs->copy_store_at(_masm, decorators, type, 8,
2365                       Address(to, qword_count, Address::times_8, -8), rax,
2366                       r10);
2367     __ decrement(qword_count);
2368     __ jcc(Assembler::notZero, L_copy_8_bytes);
2369   }
2370   if (is_oop) {
2371     __ jmp(L_exit);
2372   } else {
2373     restore_arg_regs_using_thread();
2374     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2375     __ xorptr(rax, rax); // return 0
2376     __ vzeroupper();
2377     __ leave(); // required for proper stackwalking of RuntimeStub frame
2378     __ ret(0);
2379   }
2380   {
2381     // UnsafeMemoryAccess page error: continue after unsafe access
2382     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2383 
2384     // Copy in multi-byte chunks
2385     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2386   }
2387   __ BIND(L_exit);
2388   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2389   restore_arg_regs_using_thread();
2390   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2391                           SharedRuntime::_jlong_array_copy_ctr,
2392                  rscratch1); // Update counter after rscratch1 is free
2393   __ vzeroupper();
2394   __ xorptr(rax, rax); // return 0
2395   __ leave(); // required for proper stackwalking of RuntimeStub frame
2396   __ ret(0);
2397 
2398   return start;
2399 }
2400 
2401 
2402 // Helper for generating a dynamic type check.
2403 // Smashes no registers.
2404 void StubGenerator::generate_type_check(Register sub_klass,
2405                                         Register super_check_offset,
2406                                         Register super_klass,
2407                                         Label& L_success) {
2408   assert_different_registers(sub_klass, super_check_offset, super_klass);
2409 
2410   BLOCK_COMMENT("type_check:");
2411 
2412   Label L_miss;
2413 
2414   __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
2415                                    super_check_offset);
2416   __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
2417 
2418   // Fall through on failure!
2419   __ BIND(L_miss);
2420 }
2421 
2422 //
2423 //  Generate checkcasting array copy stub
2424 //
2425 //  Input:
2426 //    c_rarg0   - source array address
2427 //    c_rarg1   - destination array address
2428 //    c_rarg2   - element count, treated as ssize_t, can be zero
2429 //    c_rarg3   - size_t ckoff (super_check_offset)
2430 // not Win64
2431 //    c_rarg4   - oop ckval (super_klass)
2432 // Win64
2433 //    rsp+40    - oop ckval (super_klass)
2434 //
2435 //  Output:
2436 //    rax ==  0  -  success
2437 //    rax == -1^K - failure, where K is partial transfer count
2438 //
2439 address StubGenerator::generate_checkcast_copy(StubId stub_id, address *entry) {
2440 
2441   bool dest_uninitialized;
2442   switch (stub_id) {
2443   case StubId::stubgen_checkcast_arraycopy_id:
2444     dest_uninitialized = false;
2445     break;
2446   case StubId::stubgen_checkcast_arraycopy_uninit_id:
2447     dest_uninitialized = true;
2448     break;
2449   default:
2450     ShouldNotReachHere();
2451   }
2452 
2453   Label L_load_element, L_store_element, L_do_card_marks, L_done;
2454 
2455   // Input registers (after setup_arg_regs)
2456   const Register from        = rdi;   // source array address
2457   const Register to          = rsi;   // destination array address
2458   const Register length      = rdx;   // elements count
2459   const Register ckoff       = rcx;   // super_check_offset
2460   const Register ckval       = r8;    // super_klass
2461 
2462   // Registers used as temps (r13, r14 are save-on-entry)
2463   const Register end_from    = from;  // source array end address
2464   const Register end_to      = r13;   // destination array end address
2465   const Register count       = rdx;   // -(count_remaining)
2466   const Register r14_length  = r14;   // saved copy of length
2467   // End pointers are inclusive, and if length is not zero they point
2468   // to the last unit copied:  end_to[0] := end_from[0]
2469 
2470   const Register rax_oop    = rax;    // actual oop copied
2471   const Register r11_klass  = r11;    // oop._klass
2472 
2473   //---------------------------------------------------------------
2474   // Assembler stub will be used for this call to arraycopy
2475   // if the two arrays are subtypes of Object[] but the
2476   // destination array type is not equal to or a supertype
2477   // of the source type.  Each element must be separately
2478   // checked.
2479 
2480   __ align(CodeEntryAlignment);
2481   StubCodeMark mark(this, stub_id);
2482   address start = __ pc();
2483 
2484   __ enter(); // required for proper stackwalking of RuntimeStub frame
2485 
2486 #ifdef ASSERT
2487   // caller guarantees that the arrays really are different
2488   // otherwise, we would have to make conjoint checks
2489   { Label L;
2490     array_overlap_test(L, TIMES_OOP);
2491     __ stop("checkcast_copy within a single array");
2492     __ bind(L);
2493   }
2494 #endif //ASSERT
2495 
2496   setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
2497                                   // ckoff => rcx, ckval => r8
2498                                   // r9 is used to save r15_thread
2499 #ifdef _WIN64
2500   // last argument (#4) is on stack on Win64
2501   __ movptr(ckval, Address(rsp, 6 * wordSize));
2502 #endif
2503 
2504   // Caller of this entry point must set up the argument registers.
2505   if (entry != nullptr) {
2506     *entry = __ pc();
2507     BLOCK_COMMENT("Entry:");
2508   }
2509 
2510   // allocate spill slots for r13, r14 and r10
2511   enum {
2512     saved_r13_offset,
2513     saved_r14_offset,
2514     saved_r10_offset,
2515     saved_rbp_offset
2516   };
2517   __ subptr(rsp, saved_rbp_offset * wordSize);
2518   __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2519   __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2520   __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2521 
2522 #ifdef ASSERT
2523     Label L2;
2524     __ get_thread_slow(r14);
2525     __ cmpptr(r15_thread, r14);
2526     __ jcc(Assembler::equal, L2);
2527     __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2528     __ bind(L2);
2529 #endif // ASSERT
2530 
2531   // check that int operands are properly extended to size_t
2532   assert_clean_int(length, rax);
2533   assert_clean_int(ckoff, rax);
2534 
2535 #ifdef ASSERT
2536   BLOCK_COMMENT("assert consistent ckoff/ckval");
2537   // The ckoff and ckval must be mutually consistent,
2538   // even though caller generates both.
2539   { Label L;
2540     int sco_offset = in_bytes(Klass::super_check_offset_offset());
2541     __ cmpl(ckoff, Address(ckval, sco_offset));
2542     __ jcc(Assembler::equal, L);
2543     __ stop("super_check_offset inconsistent");
2544     __ bind(L);
2545   }
2546 #endif //ASSERT
2547 
2548   // Loop-invariant addresses.  They are exclusive end pointers.
2549   Address end_from_addr(from, length, TIMES_OOP, 0);
2550   Address   end_to_addr(to,   length, TIMES_OOP, 0);
2551   // Loop-variant addresses.  They assume post-incremented count < 0.
2552   Address from_element_addr(end_from, count, TIMES_OOP, 0);
2553   Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2554 
2555   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2556   if (dest_uninitialized) {
2557     decorators |= IS_DEST_UNINITIALIZED;
2558   }
2559 
2560   BasicType type = T_OBJECT;
2561   size_t element_size = UseCompressedOops ? 4 : 8;
2562 
2563   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2564   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2565 
2566   // Copy from low to high addresses, indexed from the end of each array.
2567   __ lea(end_from, end_from_addr);
2568   __ lea(end_to,   end_to_addr);
2569   __ movptr(r14_length, length);        // save a copy of the length
2570   assert(length == count, "");          // else fix next line:
2571   __ negptr(count);                     // negate and test the length
2572   __ jcc(Assembler::notZero, L_load_element);
2573 
2574   // Empty array:  Nothing to do.
2575   __ xorptr(rax, rax);                  // return 0 on (trivial) success
2576   __ jmp(L_done);
2577 
2578   // ======== begin loop ========
2579   // (Loop is rotated; its entry is L_load_element.)
2580   // Loop control:
2581   //   for (count = -count; count != 0; count++)
2582   // Base pointers src, dst are biased by 8*(count-1), to the last element.
2583   __ align(OptoLoopAlignment);
2584 
2585   __ BIND(L_store_element);
2586   bs->copy_store_at(_masm,
2587                     decorators,
2588                     type,
2589                     element_size,
2590                     to_element_addr,
2591                     rax_oop,
2592                     r10);
2593   __ increment(count);               // increment the count toward zero
2594   __ jcc(Assembler::zero, L_do_card_marks);
2595 
2596   // ======== loop entry is here ========
2597   __ BIND(L_load_element);
2598   bs->copy_load_at(_masm,
2599                    decorators,
2600                    type,
2601                    element_size,
2602                    rax_oop,
2603                    from_element_addr,
2604                    r10);
2605   __ testptr(rax_oop, rax_oop);
2606   __ jcc(Assembler::zero, L_store_element);
2607 
2608   __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2609   generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2610   // ======== end loop ========
2611 
2612   // It was a real error; we must depend on the caller to finish the job.
2613   // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2614   // Emit GC store barriers for the oops we have copied (r14 + rdx),
2615   // and report their number to the caller.
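  // (Illustrative: if K oops were copied before the failing element, the
  //  caller receives rax == ~K, e.g. K == 3 yields rax == -4.)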
2616   assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2617   Label L_post_barrier;
2618   __ addptr(r14_length, count);     // K = (original - remaining) oops
2619   __ movptr(rax, r14_length);       // save the value
2620   __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2621   __ jccb(Assembler::notZero, L_post_barrier);
2622   __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2623 
2624   // Come here on success only.
2625   __ BIND(L_do_card_marks);
2626   __ xorptr(rax, rax);              // return 0 on success
2627 
2628   __ BIND(L_post_barrier);
2629   bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2630 
2631   // Common exit point (success or failure).
2632   __ BIND(L_done);
2633   __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2634   __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2635   __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2636   restore_arg_regs_using_thread();
2637   INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2638   __ leave(); // required for proper stackwalking of RuntimeStub frame
2639   __ ret(0);
2640 
2641   return start;
2642 }
2643 
2644 
2645 //  Generate 'unsafe' array copy stub
2646 //  Though just as safe as the other stubs, it takes an unscaled
2647 //  size_t argument instead of an element count.
2648 //
2649 //  Input:
2650 //    c_rarg0   - source array address
2651 //    c_rarg1   - destination array address
2652 //    c_rarg2   - byte count, treated as ssize_t, can be zero
2653 //
2654 // Examines the alignment of the operands and dispatches
2655 // to a long, int, short, or byte copy loop.
2656 //
2657 address StubGenerator::generate_unsafe_copy(address byte_copy_entry, address short_copy_entry,
2658                                             address int_copy_entry, address long_copy_entry) {
2659 
2660   Label L_long_aligned, L_int_aligned, L_short_aligned;
2661 
2662   // Input registers (before setup_arg_regs)
2663   const Register from        = c_rarg0;  // source array address
2664   const Register to          = c_rarg1;  // destination array address
2665   const Register size        = c_rarg2;  // byte count (size_t)
2666 
2667   // Register used as a temp
2668   const Register bits        = rax;      // test copy of low bits
2669 
2670   __ align(CodeEntryAlignment);
2671   StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2672   StubCodeMark mark(this, stub_id);
2673   address start = __ pc();
2674 
2675   __ enter(); // required for proper stackwalking of RuntimeStub frame
2676 
2677   // bump this on entry, not on exit:
2678   INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
2679 
2680   __ mov(bits, from);
2681   __ orptr(bits, to);
2682   __ orptr(bits, size);
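  // The OR merges the low bits of all three operands, so one test of
  // 'bits' suffices: e.g. (bits & 7) == 0 iff from, to and size are all
  // 8-byte aligned.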
2683 
2684   __ testb(bits, BytesPerLong-1);
2685   __ jccb(Assembler::zero, L_long_aligned);
2686 
2687   __ testb(bits, BytesPerInt-1);
2688   __ jccb(Assembler::zero, L_int_aligned);
2689 
2690   __ testb(bits, BytesPerShort-1);
2691   __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2692 
2693   __ BIND(L_short_aligned);
2694   __ shrptr(size, LogBytesPerShort); // size => short_count
2695   __ jump(RuntimeAddress(short_copy_entry));
2696 
2697   __ BIND(L_int_aligned);
2698   __ shrptr(size, LogBytesPerInt); // size => int_count
2699   __ jump(RuntimeAddress(int_copy_entry));
2700 
2701   __ BIND(L_long_aligned);
2702   __ shrptr(size, LogBytesPerLong); // size => qword_count
2703   __ jump(RuntimeAddress(long_copy_entry));
2704 
2705   return start;
2706 }
2707 
2708 
2709 // Static enum for helper
2710 enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
2711 // Helper for generate_unsafe_setmemory
2712 //
// Atomically fill a memory region using 2-, 4-, or 8-byte chunks
2714 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
2715                                      Register size, Register wide_value,
2716                                      Register tmp, Label& L_exit,
2717                                      MacroAssembler *_masm) {
2718   Label L_Loop, L_Tail, L_TailLoop;
2719 
2720   int shiftval = 0;
2721   int incr = 0;
2722 
2723   switch (type) {
2724     case USM_SHORT:
2725       shiftval = 1;
2726       incr = 16;
2727       break;
2728     case USM_DWORD:
2729       shiftval = 2;
2730       incr = 32;
2731       break;
2732     case USM_QUADWORD:
2733       shiftval = 3;
2734       incr = 64;
2735       break;
2736   }
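  // The two loops below implement, roughly (X = chunk size in bytes,
  // n = size / X = number of chunks):
  //   for (i = 0; i < n / 8; i++) { store 8 chunks; dest += 8 * X; }
  //   for (j = 0; j < n % 8; j++) { store 1 chunk;  dest += X;     }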
2737 
2738   // At this point, we know the lower bits of size are zero
2739   __ shrq(size, shiftval);
2740   // size now has number of X-byte chunks (2, 4 or 8)
2741 
2742   // Number of (8*X)-byte chunks into tmp
2743   __ movq(tmp, size);
2744   __ shrq(tmp, 3);
2745   __ jccb(Assembler::zero, L_Tail);
2746 
2747   __ BIND(L_Loop);
2748 
2749   // Unroll 8 stores
2750   for (int i = 0; i < 8; i++) {
2751     switch (type) {
2752       case USM_SHORT:
2753         __ movw(Address(dest, (2 * i)), wide_value);
2754         break;
2755       case USM_DWORD:
2756         __ movl(Address(dest, (4 * i)), wide_value);
2757         break;
2758       case USM_QUADWORD:
2759         __ movq(Address(dest, (8 * i)), wide_value);
2760         break;
2761     }
2762   }
2763   __ addq(dest, incr);
2764   __ decrementq(tmp);
2765   __ jccb(Assembler::notZero, L_Loop);
2766 
2767   __ BIND(L_Tail);
2768 
2769   // Find number of remaining X-byte chunks
2770   __ andq(size, 0x7);
2771 
2772   // If zero, then we're done
2773   __ jccb(Assembler::zero, L_exit);
2774 
2775   __ BIND(L_TailLoop);
2776 
  switch (type) {
    case USM_SHORT:
      __ movw(Address(dest, 0), wide_value);
      break;
    case USM_DWORD:
      __ movl(Address(dest, 0), wide_value);
      break;
    case USM_QUADWORD:
      __ movq(Address(dest, 0), wide_value);
      break;
  }
2788   __ addq(dest, incr >> 3);
2789   __ decrementq(size);
2790   __ jccb(Assembler::notZero, L_TailLoop);
2791 }
2792 
2793 //  Generate 'unsafe' set memory stub
2794 //  Though just as safe as the other stubs, it takes an unscaled
2795 //  size_t (# bytes) argument instead of an element count.
2796 //
2797 //  Input:
2798 //    c_rarg0   - destination array address
2799 //    c_rarg1   - byte count (size_t)
2800 //    c_rarg2   - byte value
2801 //
// Examines the alignment of the operands and dispatches
// to a quadword, int, short, or byte fill loop.
2804 //
2805 address StubGenerator::generate_unsafe_setmemory(address unsafe_byte_fill) {
2806   __ align(CodeEntryAlignment);
2807   StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
2808   StubCodeMark mark(this, stub_id);
2809   address start = __ pc();
2810   __ enter();   // required for proper stackwalking of RuntimeStub frame
2811 
2812   assert(unsafe_byte_fill != nullptr, "Invalid call");
2813 
2814   // bump this on entry, not on exit:
2815   INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);
2816 
2817   {
2818     Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;
2819 
2820     const Register dest = c_rarg0;
2821     const Register size = c_rarg1;
2822     const Register byteVal = c_rarg2;
2823     const Register wide_value = rax;
2824     const Register rScratch1 = r10;
2825 
2826     assert_different_registers(dest, size, byteVal, wide_value, rScratch1);
2827 
    // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2829 
2830     __ testq(size, size);
2831     __ jcc(Assembler::zero, L_exit);
2832 
    // Replicate the byte value across the full 64-bit register
2834     __ movzbl(rScratch1, byteVal);
2835     __ mov64(wide_value, 0x0101010101010101ULL);
2836     __ imulq(wide_value, rScratch1);
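    // (Illustrative: byteVal == 0x5A yields wide_value == 0x5A5A5A5A5A5A5A5A.)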
2837 
2838     // Check for pointer & size alignment
2839     __ movq(rScratch1, dest);
2840     __ orq(rScratch1, size);
2841 
2842     __ testb(rScratch1, 7);
2843     __ jcc(Assembler::equal, L_fillQuadwords);
2844 
2845     __ testb(rScratch1, 3);
2846     __ jcc(Assembler::equal, L_fillDwords);
2847 
2848     __ testb(rScratch1, 1);
2849     __ jcc(Assembler::notEqual, L_fillBytes);
2850 
2851     // Fill words
2852     {
2853       UnsafeMemoryAccessMark umam(this, true, true);
2854 
      // At this point, we know the low bit of both dest and size is zero,
      // i.e. both are multiples of 2
2857       do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
2858                                L_exit, _masm);
2859     }
2860     __ jmpb(L_exit);
2861 
2862     __ BIND(L_fillQuadwords);
2863 
2864     // Fill QUADWORDs
2865     {
2866       UnsafeMemoryAccessMark umam(this, true, true);
2867 
      // At this point, we know the lower 3 bits of both dest and size are
      // zero, i.e. both are multiples of 8
2870       do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
2871                                L_exit, _masm);
2872     }
2873     __ BIND(L_exit);
2874 
2875     __ leave();   // required for proper stackwalking of RuntimeStub frame
2876     __ ret(0);
2877 
2878     __ BIND(L_fillDwords);
2879 
2880     // Fill DWORDs
2881     {
2882       UnsafeMemoryAccessMark umam(this, true, true);
2883 
      // At this point, we know the lower 2 bits of both dest and size are
      // zero, i.e. both are multiples of 4
2886       do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
2887                                L_exit, _masm);
2888     }
2889     __ jmpb(L_exit);
2890 
2891     __ BIND(L_fillBytes);
2892     // Set up for tail call to previously generated byte fill routine
2893     // Parameter order is (ptr, byteVal, size)
2894     __ xchgq(c_rarg1, c_rarg2);
2895     __ leave();    // Clear effect of enter()
2896     __ jump(RuntimeAddress(unsafe_byte_fill));
2897   }
2898 
2899   return start;
2900 }
2901 
2902 // Perform range checks on the proposed arraycopy.
// Kills temp, but nothing else.
// Also cleans the sign/high bits of src_pos and dst_pos.
2905 void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2906                                            Register src_pos, // source position (c_rarg1)
                                           Register dst,     // destination array oop (c_rarg2)
2908                                            Register dst_pos, // destination position (c_rarg3)
2909                                            Register length,
2910                                            Register temp,
2911                                            Label& L_failed) {
2912   BLOCK_COMMENT("arraycopy_range_checks:");
2913 
2914   //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2915   __ movl(temp, length);
2916   __ addl(temp, src_pos);             // src_pos + length
2917   __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2918   __ jcc(Assembler::above, L_failed);
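  // (The unsigned 'above' compare also rejects a sum that set the sign
  //  bit, since both inputs are non-negative 32-bit values.)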
2919 
2920   //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2921   __ movl(temp, length);
2922   __ addl(temp, dst_pos);             // dst_pos + length
2923   __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2924   __ jcc(Assembler::above, L_failed);
2925 
  // Have to clean up the high 32 bits of 'src_pos' and 'dst_pos'.
  // Sign extension is sufficient, since both values are known to be non-negative.
2928   __ movslq(src_pos, src_pos);
2929   __ movslq(dst_pos, dst_pos);
2930 
2931   BLOCK_COMMENT("arraycopy_range_checks done");
2932 }
2933 
2934 
2935 //  Generate generic array copy stubs
2936 //
2937 //  Input:
2938 //    c_rarg0    -  src oop
2939 //    c_rarg1    -  src_pos (32-bits)
2940 //    c_rarg2    -  dst oop
2941 //    c_rarg3    -  dst_pos (32-bits)
2942 // not Win64
2943 //    c_rarg4    -  element count (32-bits)
2944 // Win64
2945 //    rsp+40     -  element count (32-bits)
2946 //
2947 //  Output:
2948 //    rax ==  0  -  success
2949 //    rax == -1^K - failure, where K is partial transfer count
2950 //
2951 address StubGenerator::generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2952                                              address int_copy_entry, address oop_copy_entry,
2953                                              address long_copy_entry, address checkcast_copy_entry) {
2954 
2955   Label L_failed, L_failed_0, L_objArray;
2956   Label L_copy_shorts, L_copy_ints, L_copy_longs;
2957 
2958   // Input registers
2959   const Register src        = c_rarg0;  // source array oop
2960   const Register src_pos    = c_rarg1;  // source position
2961   const Register dst        = c_rarg2;  // destination array oop
2962   const Register dst_pos    = c_rarg3;  // destination position
2963 #ifndef _WIN64
2964   const Register length     = c_rarg4;
2965   const Register rklass_tmp = r9;  // load_klass
2966 #else
2967   const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
2968   const Register rklass_tmp = rdi;  // load_klass
2969 #endif
2970 
2971   { int modulus = CodeEntryAlignment;
2972     int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2973     int advance = target - (__ offset() % modulus);
2974     if (advance < 0)  advance += modulus;
2975     if (advance > 0)  __ nop(advance);
2976   }
2977   StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2978   StubCodeMark mark(this, stub_id);
2979 
2980   // Short-hop target to L_failed.  Makes for denser prologue code.
2981   __ BIND(L_failed_0);
2982   __ jmp(L_failed);
2983   assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2984 
2985   __ align(CodeEntryAlignment);
2986   address start = __ pc();
2987 
2988   __ enter(); // required for proper stackwalking of RuntimeStub frame
2989 
2990 #ifdef _WIN64
2991   __ push_ppx(rklass_tmp); // rdi is callee-save on Windows
2992 #endif
2993 
2994   // bump this on entry, not on exit:
2995   INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);
2996 
2997   //-----------------------------------------------------------------------
2998   // Assembler stub will be used for this call to arraycopy
2999   // if the following conditions are met:
3000   //
3001   // (1) src and dst must not be null.
3002   // (2) src_pos must not be negative.
3003   // (3) dst_pos must not be negative.
3004   // (4) length  must not be negative.
3005   // (5) src klass and dst klass should be the same and not null.
3006   // (6) src and dst should be arrays.
3007   // (7) src_pos + length must not exceed length of src.
3008   // (8) dst_pos + length must not exceed length of dst.
3009   //
3010 
3011   //  if (src == nullptr) return -1;
3012   __ testptr(src, src);         // src oop
3013   size_t j1off = __ offset();
3014   __ jccb(Assembler::zero, L_failed_0);
3015 
3016   //  if (src_pos < 0) return -1;
3017   __ testl(src_pos, src_pos); // src_pos (32-bits)
3018   __ jccb(Assembler::negative, L_failed_0);
3019 
3020   //  if (dst == nullptr) return -1;
3021   __ testptr(dst, dst);         // dst oop
3022   __ jccb(Assembler::zero, L_failed_0);
3023 
3024   //  if (dst_pos < 0) return -1;
3025   __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3026   size_t j4off = __ offset();
3027   __ jccb(Assembler::negative, L_failed_0);
3028 
3029   // The first four tests are very dense code,
3030   // but not quite dense enough to put four
3031   // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predictors
3033   // do not like jumps so close together.
3034   // Make sure of this.
3035   guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
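  // ((j1off ^ j4off) & ~15) != 0 exactly when the two jump offsets lie in
  // different 16-byte-aligned blocks, i.e. the 1st and 4th jumps cannot
  // share one fetch line.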
3036 
3037   // registers used as temp
3038   const Register r11_length    = r11; // elements count to copy
3039   const Register r10_src_klass = r10; // array klass
3040 
3041   //  if (length < 0) return -1;
3042   __ movl(r11_length, length);        // length (elements count, 32-bits value)
3043   __ testl(r11_length, r11_length);
3044   __ jccb(Assembler::negative, L_failed_0);
3045 
3046   __ load_klass(r10_src_klass, src, rklass_tmp);
3047 #ifdef ASSERT
3048   //  assert(src->klass() != nullptr);
3049   {
3050     BLOCK_COMMENT("assert klasses not null {");
3051     Label L1, L2;
3052     __ testptr(r10_src_klass, r10_src_klass);
3053     __ jcc(Assembler::notZero, L2);   // it is broken if klass is null
3054     __ bind(L1);
3055     __ stop("broken null klass");
3056     __ bind(L2);
3057     __ load_klass(rax, dst, rklass_tmp);
3058     __ cmpq(rax, 0);
3059     __ jcc(Assembler::equal, L1);     // this would be broken also
3060     BLOCK_COMMENT("} assert klasses not null done");
3061   }
3062 #endif
3063 
3064   // Load layout helper (32-bits)
3065   //
3066   //  |array_tag|     | header_size | element_type |     |log2_element_size|
3067   // 32        30    24            16              8     2                 0
3068   //
3069   //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3070   //
3071 
3072   const int lh_offset = in_bytes(Klass::layout_helper_offset());
3073 
3074   // Handle objArrays completely differently...
3075   const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3076   __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3077   __ jcc(Assembler::equal, L_objArray);
3078 
3079   //  if (src->klass() != dst->klass()) return -1;
3080   __ load_klass(rax, dst, rklass_tmp);
3081   __ cmpq(r10_src_klass, rax);
3082   __ jcc(Assembler::notEqual, L_failed);
3083 
3084   const Register rax_lh = rax;  // layout helper
3085   __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3086 
3087   //  if (!src->is_Array()) return -1;
3088   __ cmpl(rax_lh, Klass::_lh_neutral_value);
3089   __ jcc(Assembler::greaterEqual, L_failed);
3090 
3091   // At this point, it is known to be a typeArray (array_tag 0x3).
3092 #ifdef ASSERT
3093   {
3094     BLOCK_COMMENT("assert primitive array {");
3095     Label L;
3096     __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
3097     __ jcc(Assembler::greaterEqual, L);
3098     __ stop("must be a primitive array");
3099     __ bind(L);
3100     BLOCK_COMMENT("} assert primitive array done");
3101   }
3102 #endif
3103 
3104   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3105                          r10, L_failed);
3106 
3107   // TypeArrayKlass
3108   //
3109   // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3110   // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3111   //
3112 
3113   const Register r10_offset = r10;    // array offset
3114   const Register rax_elsize = rax_lh; // element size
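  // Illustrative decode of the layout helper bits documented above:
  //   header_size = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
  //   log2_elsize =  lh & _lh_log2_element_size_mask;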
3115 
3116   __ movl(r10_offset, rax_lh);
3117   __ shrl(r10_offset, Klass::_lh_header_size_shift);
3118   __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3119   __ addptr(src, r10_offset);           // src array offset
3120   __ addptr(dst, r10_offset);           // dst array offset
3121   BLOCK_COMMENT("choose copy loop based on element size");
3122   __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3123 
3124 #ifdef _WIN64
3125   __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3126 #endif
3127 
  // The following registers must be set before the jump to the corresponding stub.
3129   const Register from     = c_rarg0;  // source array address
3130   const Register to       = c_rarg1;  // destination array address
3131   const Register count    = c_rarg2;  // elements count
3132 
  // 'from', 'to' and 'count' must be assigned in exactly this order, since
  // they alias 'src', 'src_pos' and 'dst': each lea must read its inputs
  // before a later assignment clobbers them.
3135 
3136   __ cmpl(rax_elsize, 0);
3137   __ jccb(Assembler::notEqual, L_copy_shorts);
3138   __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3139   __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3140   __ movl2ptr(count, r11_length); // length
3141   __ jump(RuntimeAddress(byte_copy_entry));
3142 
3143 __ BIND(L_copy_shorts);
3144   __ cmpl(rax_elsize, LogBytesPerShort);
3145   __ jccb(Assembler::notEqual, L_copy_ints);
3146   __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3147   __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3148   __ movl2ptr(count, r11_length); // length
3149   __ jump(RuntimeAddress(short_copy_entry));
3150 
3151 __ BIND(L_copy_ints);
3152   __ cmpl(rax_elsize, LogBytesPerInt);
3153   __ jccb(Assembler::notEqual, L_copy_longs);
3154   __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3155   __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3156   __ movl2ptr(count, r11_length); // length
3157   __ jump(RuntimeAddress(int_copy_entry));
3158 
3159 __ BIND(L_copy_longs);
3160 #ifdef ASSERT
3161   {
3162     BLOCK_COMMENT("assert long copy {");
3163     Label L;
3164     __ cmpl(rax_elsize, LogBytesPerLong);
3165     __ jcc(Assembler::equal, L);
3166     __ stop("must be long copy, but elsize is wrong");
3167     __ bind(L);
3168     BLOCK_COMMENT("} assert long copy done");
3169   }
3170 #endif
3171   __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3172   __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3173   __ movl2ptr(count, r11_length); // length
3174   __ jump(RuntimeAddress(long_copy_entry));
3175 
3176   // ObjArrayKlass
3177 __ BIND(L_objArray);
3178   // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3179 
3180   Label L_plain_copy, L_checkcast_copy;
3181   //  test array classes for subtyping
3182   __ load_klass(rax, dst, rklass_tmp);
3183   __ cmpq(r10_src_klass, rax); // usual case is exact equality
3184   __ jcc(Assembler::notEqual, L_checkcast_copy);
3185 
3186   // Identically typed arrays can be copied without element-wise checks.
3187   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3188                          r10, L_failed);
3189 
3190   __ lea(from, Address(src, src_pos, TIMES_OOP,
3191                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3192   __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3193                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3194   __ movl2ptr(count, r11_length); // length
3195 __ BIND(L_plain_copy);
3196 #ifdef _WIN64
3197   __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3198 #endif
3199   __ jump(RuntimeAddress(oop_copy_entry));
3200 
3201 __ BIND(L_checkcast_copy);
3202   // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3203   {
3204     // Before looking at dst.length, make sure dst is also an objArray.
3205     __ cmpl(Address(rax, lh_offset), objArray_lh);
3206     __ jcc(Assembler::notEqual, L_failed);
3207 
3208     // It is safe to examine both src.length and dst.length.
3209     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3210                            rax, L_failed);
3211 
3212     const Register r11_dst_klass = r11;
3213     __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3214 
3215     // Marshal the base address arguments now, freeing registers.
3216     __ lea(from, Address(src, src_pos, TIMES_OOP,
3217                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3218     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3219                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3220     __ movl(count, length);           // length (reloaded)
3221     Register sco_temp = c_rarg3;      // this register is free now
3222     assert_different_registers(from, to, count, sco_temp,
3223                                r11_dst_klass, r10_src_klass);
3224     assert_clean_int(count, sco_temp);
3225 
3226     // Generate the type check.
3227     const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3228     __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3229     assert_clean_int(sco_temp, rax);
3230     generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3231 
3232     // Fetch destination element klass from the ObjArrayKlass header.
3233     int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3234     __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3235     __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3236     assert_clean_int(sco_temp, rax);
3237 
3238 #ifdef _WIN64
3239     __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3240 #endif
3241 
3242     // the checkcast_copy loop needs two extra arguments:
3243     assert(c_rarg3 == sco_temp, "#3 already in place");
3244     // Set up arguments for checkcast_copy_entry.
3245     setup_arg_regs_using_thread(4);
3246     __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3247     __ jump(RuntimeAddress(checkcast_copy_entry));
3248   }
3249 
3250 __ BIND(L_failed);
3251 #ifdef _WIN64
3252   __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3253 #endif
3254   __ xorptr(rax, rax);
3255   __ notptr(rax); // return -1
3256   __ leave();   // required for proper stackwalking of RuntimeStub frame
3257   __ ret(0);
3258 
3259   return start;
3260 }
3261 
3262 #undef __