1 /*
   2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/macroAssembler.hpp"
  26 #include "gc/shared/barrierSet.hpp"
  27 #include "gc/shared/barrierSetAssembler.hpp"
  28 #include "oops/objArrayKlass.hpp"
  29 #include "runtime/sharedRuntime.hpp"
  30 #include "runtime/stubRoutines.hpp"
  31 #include "stubGenerator_x86_64.hpp"
  32 #ifdef COMPILER2
  33 #include "opto/c2_globals.hpp"
  34 #endif
  35 
// Shorthand used throughout this file: "__ foo(...)" expands to
// "_masm->foo(...)".
#define __ _masm->

// Address scale factor for one oop element: times_4 when UseCompressedOops
// is set, times_8 otherwise.
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

// BLOCK_COMMENT(str) expands to nothing in PRODUCT builds; in all other
// builds it expands to "__ block_comment(str)".
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

// BIND(label) is meant to follow "__" (i.e. "__ BIND(L);"): it binds the
// label and then issues BLOCK_COMMENT("<label>:") using the label's
// spelling. NOTE: the expansion is two statements, so it must not be used
// as the unbraced body of an if/else/loop.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  47 
// INC_COUNTER_NP(counter, rscratch): a no-op expression in PRODUCT builds.
// In all other builds it issues a block comment naming the counter and then
// calls inc_counter_np(_masm, counter, rscratch).
// NOTE: the non-PRODUCT expansion is two statements and already ends in
// ';', so use it only as a stand-alone statement (never as the unbraced
// body of an if/else/loop).
#ifdef PRODUCT
#define INC_COUNTER_NP(counter, rscratch) ((void)0)
#else
#define INC_COUNTER_NP(counter, rscratch) \
BLOCK_COMMENT("inc_counter " #counter); \
inc_counter_np(_masm, counter, rscratch);
  54 
  55 static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  56   __ incrementl(ExternalAddress((address)&counter), rscratch);
  57 }
  58 
  59 #ifdef COMPILER2
  60 static uint& get_profile_ctr(int shift) {
  61   if (shift == 0) {
  62     return SharedRuntime::_jbyte_array_copy_ctr;
  63   } else if (shift == 1) {
  64     return SharedRuntime::_jshort_array_copy_ctr;
  65   } else if (shift == 2) {
  66     return SharedRuntime::_jint_array_copy_ctr;
  67   } else {
  68     assert(shift == 3, "");
  69     return SharedRuntime::_jlong_array_copy_ctr;
  70   }
  71 }
  72 #endif // COMPILER2
  73 #endif // !PRODUCT
  74 
// Generate the arraycopy, checkcast, unsafe/generic copy and fill stubs and
// publish their entry points in the corresponding StubRoutines fields.
//
// The order of the statements below matters: every conjoint generator is
// handed the nopush entry of the disjoint stub generated just before it,
// and the unsafe/generic generators are handed the nopush entries of the
// conjoint (and checkcast) stubs generated earlier. The single local
// 'nopush_entry' is overwritten by each generator call, so it must be
// saved into its StubRoutines field before the next call.
void StubGenerator::generate_arraycopy_stubs() {
  // Some copy stubs publish a normal entry and then a 2nd 'fallback'
  // entry immediately following their stack push. This can be used
  // as a post-push branch target for compatible stubs when they
  // identify a special case that can be handled by the fallback
  // stub, e.g. a disjoint copy stub may be used as a special case
  // fallback for its compatible conjoint copy stub.
  //
  // A no push entry is always returned in the following local and
  // then published by assigning to the appropriate entry field in
  // class StubRoutines. The entry value is then passed to the
  // generator for the compatible stub. That means the entry must be
  // listed when saving to/restoring from the AOT cache, ensuring
  // that the inter-stub jumps are noted at AOT-cache save and
  // relocated at AOT cache load.
  address nopush_entry;

  // jbyte copies
  StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(&nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
  StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jbyte_arraycopy_nopush    = nopush_entry;

  // jshort copies
  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jshort_arraycopy_nopush   = nopush_entry;

  // jint copies (the int/oop generators are selected by stub id)
  StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jint_arraycopy_nopush     = nopush_entry;

  // jlong copies (the long/oop generators are selected by stub id)
  StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jlong_disjoint_arraycopy_nopush  = nopush_entry;
  StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jlong_arraycopy_nopush    = nopush_entry;

  // oop copies: use the int-sized generators when UseCompressedOops is
  // set and the long-sized generators otherwise.
  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush    = nopush_entry;
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush  = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  } else {
    StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush    = nopush_entry;
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush  = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  }

  // checkcast copies
  StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
  // checkcast nopush entry is needed by generic copy
  StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
  // note that we don't need a returned nopush entry because the
  // generic copy does not cater for uninit arrays.
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);

  // unsafe and generic copies are given the conjoint (and checkcast)
  // nopush entries published above
  StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                            StubRoutines::_jshort_arraycopy_nopush,
                                                            StubRoutines::_jint_arraycopy_nopush,
                                                            StubRoutines::_jlong_arraycopy_nopush);
  StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                             StubRoutines::_jshort_arraycopy_nopush,
                                                             StubRoutines::_jint_arraycopy_nopush,
                                                             StubRoutines::_oop_arraycopy_nopush,
                                                             StubRoutines::_jlong_arraycopy_nopush,
                                                             StubRoutines::_checkcast_arraycopy_nopush);

  // fill stubs
  StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
  StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
  StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
  StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
  StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
  StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);

  // the setmemory generator is given the jbyte fill stub generated above
  StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);

  // We don't generate specialized code for HeapWord-aligned source
  // arrays, so just use the code we've already generated
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

  StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

  StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
  StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

  StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
  StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
}
 195 
 196 
 197 // Verify that a register contains clean 32-bits positive value
 198 // (high 32-bits are 0) so it could be used in 64-bits shifts.
 199 //
 200 //  Input:
 201 //    Rint  -  32-bits value
 202 //    Rtmp  -  scratch
 203 //
 204 void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
 205 #ifdef ASSERT
 206   Label L;
 207   assert_different_registers(Rtmp, Rint);
 208   __ movslq(Rtmp, Rint);
 209   __ cmpq(Rtmp, Rint);
 210   __ jcc(Assembler::equal, L);
 211   __ stop("high 32-bits of int value are not 0");
 212   __ bind(L);
 213 #endif
 214 }
 215 
 216 
 217 //  Generate overlap test for array copy stubs
 218 //
 219 //  Input:
 220 //     c_rarg0 - from
 221 //     c_rarg1 - to
 222 //     c_rarg2 - element count
 223 //
 224 //  Output:
 225 //     rax   - &from[element count - 1]
 226 //
 227 void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
 228   const Register from     = c_rarg0;
 229   const Register to       = c_rarg1;
 230   const Register count    = c_rarg2;
 231   const Register end_from = rax;
 232 
 233   __ cmpptr(to, from);
 234   __ lea(end_from, Address(from, count, sf, 0));
 235   if (NOLp == nullptr) {
 236     RuntimeAddress no_overlap(no_overlap_target);
 237     __ jump_cc(Assembler::belowEqual, no_overlap);
 238     __ cmpptr(to, end_from);
 239     __ jump_cc(Assembler::aboveEqual, no_overlap);
 240   } else {
 241     __ jcc(Assembler::belowEqual, (*NOLp));
 242     __ cmpptr(to, end_from);
 243     __ jcc(Assembler::aboveEqual, (*NOLp));
 244   }
 245 }
 246 
 247 
 248 // Copy big chunks forward
 249 //
 250 // Inputs:
 251 //   end_from     - source arrays end address
 252 //   end_to       - destination array end address
 253 //   qword_count  - 64-bits element count, negative
 254 //   tmp1         - scratch
 255 //   L_copy_bytes - entry label
 256 //   L_copy_8_bytes  - exit  label
 257 //
 258 void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
 259                                        Register qword_count, Register tmp1,
 260                                        Register tmp2, Label& L_copy_bytes,
 261                                        Label& L_copy_8_bytes, DecoratorSet decorators,
 262                                        BasicType type) {
 263   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 264   DEBUG_ONLY(__ stop("enter at entry label, not here"));
 265   Label L_loop;
 266   __ align(OptoLoopAlignment);
 267   if (UseUnalignedLoadStores) {
 268     Label L_end;
 269     __ BIND(L_loop);
 270     if (UseAVX >= 2) {
 271       bs->copy_load_at(_masm, decorators, type, 32,
 272                        xmm0, Address(end_from, qword_count, Address::times_8, -56),
 273                        tmp1, xmm1);
 274       bs->copy_store_at(_masm, decorators, type, 32,
 275                         Address(end_to, qword_count, Address::times_8, -56), xmm0,
 276                         tmp1, tmp2, xmm1);
 277 
 278       bs->copy_load_at(_masm, decorators, type, 32,
 279                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 280                        tmp1, xmm1);
 281       bs->copy_store_at(_masm, decorators, type, 32,
 282                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 283                         tmp1, tmp2, xmm1);
 284     } else {
 285       bs->copy_load_at(_masm, decorators, type, 16,
 286                        xmm0, Address(end_from, qword_count, Address::times_8, -56),
 287                        tmp1, xmm1);
 288       bs->copy_store_at(_masm, decorators, type, 16,
 289                         Address(end_to, qword_count, Address::times_8, -56), xmm0,
 290                         tmp1, tmp2, xmm1);
 291       bs->copy_load_at(_masm, decorators, type, 16,
 292                        xmm0, Address(end_from, qword_count, Address::times_8, -40),
 293                        tmp1, xmm1);
 294       bs->copy_store_at(_masm, decorators, type, 16,
 295                         Address(end_to, qword_count, Address::times_8, -40), xmm0,
 296                         tmp1, tmp2, xmm1);
 297       bs->copy_load_at(_masm, decorators, type, 16,
 298                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 299                        tmp1, xmm1);
 300       bs->copy_store_at(_masm, decorators, type, 16,
 301                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 302                         tmp1, tmp2, xmm1);
 303       bs->copy_load_at(_masm, decorators, type, 16,
 304                        xmm0, Address(end_from, qword_count, Address::times_8, -8),
 305                        tmp1, xmm1);
 306       bs->copy_store_at(_masm, decorators, type, 16,
 307                         Address(end_to, qword_count, Address::times_8, -8), xmm0,
 308                         tmp1, tmp2, xmm1);
 309     }
 310 
 311     __ BIND(L_copy_bytes);
 312     __ addptr(qword_count, 8);
 313     __ jcc(Assembler::lessEqual, L_loop);
 314     __ subptr(qword_count, 4);  // sub(8) and add(4)
 315     __ jcc(Assembler::greater, L_end);
 316     // Copy trailing 32 bytes
 317     if (UseAVX >= 2) {
 318       bs->copy_load_at(_masm, decorators, type, 32,
 319                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 320                        tmp1, xmm1);
 321       bs->copy_store_at(_masm, decorators, type, 32,
 322                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 323                         tmp1, tmp2, xmm1);
 324     } else {
 325       bs->copy_load_at(_masm, decorators, type, 16,
 326                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 327                        tmp1, xmm1);
 328       bs->copy_store_at(_masm, decorators, type, 16,
 329                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 330                         tmp1, tmp2, xmm1);
 331       bs->copy_load_at(_masm, decorators, type, 16,
 332                        xmm0, Address(end_from, qword_count, Address::times_8, -8),
 333                        tmp1, xmm1);
 334       bs->copy_store_at(_masm, decorators, type, 16,
 335                         Address(end_to, qword_count, Address::times_8, -8), xmm0,
 336                         tmp1, tmp2, xmm1);
 337     }
 338     __ addptr(qword_count, 4);
 339     __ BIND(L_end);
 340   } else {
 341     // Copy 32-bytes per iteration
 342     __ BIND(L_loop);
 343     bs->copy_load_at(_masm, decorators, type, 8,
 344                      tmp1, Address(end_from, qword_count, Address::times_8, -24),
 345                      tmp2);
 346     bs->copy_store_at(_masm, decorators, type, 8,
 347                       Address(end_to, qword_count, Address::times_8, -24), tmp1,
 348                       tmp2);
 349     bs->copy_load_at(_masm, decorators, type, 8,
 350                      tmp1, Address(end_from, qword_count, Address::times_8, -16),
 351                      tmp2);
 352     bs->copy_store_at(_masm, decorators, type, 8,
 353                       Address(end_to, qword_count, Address::times_8, -16), tmp1,
 354                       tmp2);
 355     bs->copy_load_at(_masm, decorators, type, 8,
 356                      tmp1, Address(end_from, qword_count, Address::times_8, -8),
 357                      tmp2);
 358     bs->copy_store_at(_masm, decorators, type, 8,
 359                       Address(end_to, qword_count, Address::times_8, -8), tmp1,
 360                       tmp2);
 361     bs->copy_load_at(_masm, decorators, type, 8,
 362                      tmp1, Address(end_from, qword_count, Address::times_8, 0),
 363                      tmp2);
 364     bs->copy_store_at(_masm, decorators, type, 8,
 365                       Address(end_to, qword_count, Address::times_8, 0), tmp1,
 366                       tmp2);
 367 
 368     __ BIND(L_copy_bytes);
 369     __ addptr(qword_count, 4);
 370     __ jcc(Assembler::lessEqual, L_loop);
 371   }
 372   __ subptr(qword_count, 4);
 373   __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
 374 }
 375 
 376 
 377 // Copy big chunks backward
 378 //
 379 // Inputs:
 380 //   from         - source arrays address
 381 //   dest         - destination array address
 382 //   qword_count  - 64-bits element count
 383 //   tmp1         - scratch
 384 //   L_copy_bytes - entry label
 385 //   L_copy_8_bytes  - exit  label
 386 //
 387 void StubGenerator::copy_bytes_backward(Register from, Register dest,
 388                                         Register qword_count, Register tmp1,
 389                                         Register tmp2, Label& L_copy_bytes,
 390                                         Label& L_copy_8_bytes, DecoratorSet decorators,
 391                                         BasicType type) {
 392   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 393   DEBUG_ONLY(__ stop("enter at entry label, not here"));
 394   Label L_loop;
 395   __ align(OptoLoopAlignment);
 396   if (UseUnalignedLoadStores) {
 397     Label L_end;
 398     __ BIND(L_loop);
 399     if (UseAVX >= 2) {
 400       bs->copy_load_at(_masm, decorators, type, 32,
 401                        xmm0, Address(from, qword_count, Address::times_8, 32),
 402                        tmp1, xmm1);
 403       bs->copy_store_at(_masm, decorators, type, 32,
 404                         Address(dest, qword_count, Address::times_8, 32), xmm0,
 405                         tmp1, tmp2, xmm1);
 406       bs->copy_load_at(_masm, decorators, type, 32,
 407                        xmm0, Address(from, qword_count, Address::times_8, 0),
 408                        tmp1, xmm1);
 409       bs->copy_store_at(_masm, decorators, type, 32,
 410                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 411                         tmp1, tmp2, xmm1);
 412     } else {
 413       bs->copy_load_at(_masm, decorators, type, 16,
 414                        xmm0, Address(from, qword_count, Address::times_8, 48),
 415                        tmp1, xmm1);
 416       bs->copy_store_at(_masm, decorators, type, 16,
 417                         Address(dest, qword_count, Address::times_8, 48), xmm0,
 418                         tmp1, tmp2, xmm1);
 419       bs->copy_load_at(_masm, decorators, type, 16,
 420                        xmm0, Address(from, qword_count, Address::times_8, 32),
 421                        tmp1, xmm1);
 422       bs->copy_store_at(_masm, decorators, type, 16,
 423                         Address(dest, qword_count, Address::times_8, 32), xmm0,
 424                         tmp1, tmp2, xmm1);
 425       bs->copy_load_at(_masm, decorators, type, 16,
 426                        xmm0, Address(from, qword_count, Address::times_8, 16),
 427                        tmp1, xmm1);
 428       bs->copy_store_at(_masm, decorators, type, 16,
 429                         Address(dest, qword_count, Address::times_8, 16), xmm0,
 430                         tmp1, tmp2, xmm1);
 431       bs->copy_load_at(_masm, decorators, type, 16,
 432                        xmm0, Address(from, qword_count, Address::times_8, 0),
 433                        tmp1, xmm1);
 434       bs->copy_store_at(_masm, decorators, type, 16,
 435                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 436                         tmp1, tmp2, xmm1);
 437     }
 438 
 439     __ BIND(L_copy_bytes);
 440     __ subptr(qword_count, 8);
 441     __ jcc(Assembler::greaterEqual, L_loop);
 442 
 443     __ addptr(qword_count, 4);  // add(8) and sub(4)
 444     __ jcc(Assembler::less, L_end);
 445     // Copy trailing 32 bytes
 446     if (UseAVX >= 2) {
 447       bs->copy_load_at(_masm, decorators, type, 32,
 448                        xmm0, Address(from, qword_count, Address::times_8, 0),
 449                        tmp1, xmm1);
 450       bs->copy_store_at(_masm, decorators, type, 32,
 451                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 452                         tmp1, tmp2, xmm1);
 453     } else {
 454       bs->copy_load_at(_masm, decorators, type, 16,
 455                        xmm0, Address(from, qword_count, Address::times_8, 16),
 456                        tmp1, xmm1);
 457       bs->copy_store_at(_masm, decorators, type, 16,
 458                         Address(dest, qword_count, Address::times_8, 16), xmm0,
 459                         tmp1, tmp2, xmm1);
 460       bs->copy_load_at(_masm, decorators, type, 16,
 461                        xmm0, Address(from, qword_count, Address::times_8, 0),
 462                        tmp1, xmm1);
 463       bs->copy_store_at(_masm, decorators, type, 16,
 464                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 465                         tmp1, tmp2, xmm1);
 466     }
 467     __ subptr(qword_count, 4);
 468     __ BIND(L_end);
 469   } else {
 470     // Copy 32-bytes per iteration
 471     __ BIND(L_loop);
 472     bs->copy_load_at(_masm, decorators, type, 8,
 473                      tmp1, Address(from, qword_count, Address::times_8, 24),
 474                      tmp2);
 475     bs->copy_store_at(_masm, decorators, type, 8,
 476                       Address(dest, qword_count, Address::times_8, 24), tmp1,
 477                       tmp2);
 478     bs->copy_load_at(_masm, decorators, type, 8,
 479                      tmp1, Address(from, qword_count, Address::times_8, 16),
 480                      tmp2);
 481     bs->copy_store_at(_masm, decorators, type, 8,
 482                       Address(dest, qword_count, Address::times_8, 16), tmp1,
 483                       tmp2);
 484     bs->copy_load_at(_masm, decorators, type, 8,
 485                      tmp1, Address(from, qword_count, Address::times_8, 8),
 486                      tmp2);
 487     bs->copy_store_at(_masm, decorators, type, 8,
 488                       Address(dest, qword_count, Address::times_8, 8), tmp1,
 489                       tmp2);
 490     bs->copy_load_at(_masm, decorators, type, 8,
 491                      tmp1, Address(from, qword_count, Address::times_8, 0),
 492                      tmp2);
 493     bs->copy_store_at(_masm, decorators, type, 8,
 494                       Address(dest, qword_count, Address::times_8, 0), tmp1,
 495                       tmp2);
 496 
 497     __ BIND(L_copy_bytes);
 498     __ subptr(qword_count, 4);
 499     __ jcc(Assembler::greaterEqual, L_loop);
 500   }
 501   __ addptr(qword_count, 4);
 502   __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
 503 }
 504 
 505 #ifdef COMPILER2
 506 
// Note: The following rules apply to AVX3 optimized arraycopy stubs:
// - If the target supports AVX3 features (BW+VL+F) then the implementation uses 32 byte vectors (YMMs)
//   for both the special cases (various small block sizes) and the aligned copy loop. This is the
//   default configuration.
// - If the copy length is above CopyAVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
//   for the main copy loop (and subsequent tail) since the bulk of the cycles will be consumed in it.
// - If the user forces MaxVectorSize=32 then above 4096 bytes it is seen that REP MOVs shows
//   better performance for disjoint copies. For conjoint/backward copies a vector based
//   copy performs better.
// - If the user sets CopyAVX3Threshold=0, then the special cases for small block sizes operate over
//   64 byte vector registers (ZMMs).
 518 
 519 // Inputs:
 520 //   c_rarg0   - source array address
 521 //   c_rarg1   - destination array address
 522 //   c_rarg2   - element count, treated as ssize_t, can be zero
 523 //
 524 //
 525 // Side Effects:
 526 //   disjoint_copy_avx3_masked is set to the no-overlap entry point
 527 //   used by generate_conjoint_[byte/int/short/long]_copy().
 528 //
 529 address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry) {
 530   // aligned is always false -- x86_64 always uses the unaligned code
 531   const bool aligned = false;
 532   int shift;
 533   bool is_oop;
 534   bool dest_uninitialized;
 535 
 536   switch (stub_id) {
 537   case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 538     shift = 0;
 539     is_oop = false;
 540     dest_uninitialized = false;
 541     break;
 542   case StubId::stubgen_jshort_disjoint_arraycopy_id:
 543     shift = 1;
 544     is_oop = false;
 545     dest_uninitialized = false;
 546     break;
 547   case StubId::stubgen_jint_disjoint_arraycopy_id:
 548     shift = 2;
 549     is_oop = false;
 550     dest_uninitialized = false;
 551     break;
 552   case StubId::stubgen_jlong_disjoint_arraycopy_id:
 553     shift = 3;
 554     is_oop = false;
 555     dest_uninitialized = false;
 556     break;
 557   case StubId::stubgen_oop_disjoint_arraycopy_id:
 558     shift = (UseCompressedOops ? 2 : 3);
 559     is_oop = true;
 560     dest_uninitialized = false;
 561     break;
 562   case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 563     shift = (UseCompressedOops ? 2 : 3);
 564     is_oop = true;
 565     dest_uninitialized = true;
 566     break;
 567   default:
 568     ShouldNotReachHere();
 569   }
 570   GrowableArray<address> entries;
 571   GrowableArray<address> extras;
 572   bool add_handlers = !is_oop && !aligned;
 573   bool add_relocs = UseZGC && is_oop;
 574   bool add_extras = add_handlers || add_relocs;
 575   // The stub employs one unsafe handler region by default but has two
 576   // when MaxVectorSize == 64 So we may expect 0, 3 or 6 extras.
 577   int handlers_count = (MaxVectorSize == 64 ? 2 : 1);
 578   int expected_entry_count = (entry != nullptr ? 2 : 1);
 579   int expected_extra_count = (add_handlers ? handlers_count : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/1/2 x UMAM {start,end,handler}
 580   int entry_count = StubInfo::entry_count(stub_id);
 581   assert(entry_count == expected_entry_count, "sanity check");
 582   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
 583   GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
 584   address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
 585   if (start != nullptr) {
 586     assert(entries.length() == expected_entry_count - 1,
 587            "unexpected entry count %d", entries.length());
 588     assert(!add_handlers || extras.length() == expected_extra_count,
 589            "unexpected handler addresses count %d", extras.length());
 590     if (entry != nullptr) {
 591       *entry = entries.at(0);
 592     }
 593     if (add_handlers) {
 594       // restore 1/2 x UMAM {start,end,handler} addresses from extras
 595       register_unsafe_access_handlers(extras, 0, handlers_count);
 596     }
 597 #if INCLUDE_ZGC
 598     // register addresses at which ZGC does colour patching
 599     if (add_relocs)  {
 600       register_reloc_addresses(extras, 0, extras.length());
 601     }
 602 #endif // INCLUDE_ZGC
 603     return start;
 604   }
 605 
 606   __ align(CodeEntryAlignment);
 607   StubCodeMark mark(this, stub_id);
 608   start = __ pc();
 609 
 610   bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
 611   const int large_threshold = 2621440; // 2.5 MB
 612   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 613   Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 614   Label L_copy_large, L_finish;
 615   const Register from        = rdi;  // source array address
 616   const Register to          = rsi;  // destination array address
 617   const Register count       = rdx;  // elements count
 618   const Register temp1       = r8;
 619   const Register temp2       = r11;
 620   const Register temp3       = rax;
 621   const Register temp4       = rcx;
 622   // End pointers are inclusive, and if count is not zero they point
 623   // to the last unit copied:  end_to[0] := end_from[0]
 624 
 625   __ enter(); // required for proper stackwalking of RuntimeStub frame
 626   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 627 
 628   if (entry != nullptr) {
 629     *entry = __ pc();
 630     entries.append(*entry);
 631      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 632     BLOCK_COMMENT("Entry:");
 633   }
 634 
 635   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 636   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 637 
 638   setup_argument_regs(type);
 639 
 640   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 641   if (dest_uninitialized) {
 642     decorators |= IS_DEST_UNINITIALIZED;
 643   }
 644   if (aligned) {
 645     decorators |= ARRAYCOPY_ALIGNED;
 646   }
 647   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 648   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 649 
 650   {
 651     // Type(shift)           byte(0), short(1), int(2),   long(3)
 652     int loop_size[]        = { 192,     96,       48,      24};
 653     int threshold[]        = { 4096,    2048,     1024,    512};
 654 
 655     // UnsafeMemoryAccess page error: continue after unsafe access
 656     UnsafeMemoryAccessMark umam(this, add_handlers, true);
 657     // 'from', 'to' and 'count' are now valid
 658 
 659     // temp1 holds remaining count and temp4 holds running count used to compute
 660     // next address offset for start of to/from addresses (temp4 * scale).
 661     __ mov64(temp4, 0);
 662     __ movq(temp1, count);
 663 
 664     // Zero length check.
 665     __ BIND(L_tail);
 666     __ cmpq(temp1, 0);
 667     __ jcc(Assembler::lessEqual, L_exit);
 668 
 669     // Special cases using 32 byte [masked] vector copy operations.
 670     arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 671                                  temp4, temp3, use64byteVector, L_entry, L_exit);
 672 
 673     // PRE-MAIN-POST loop for aligned copy.
 674     __ BIND(L_entry);
 675 
 676     if (MaxVectorSize == 64) {
 677       __ movq(temp2, temp1);
 678       __ shlq(temp2, shift);
 679       __ cmpq(temp2, large_threshold);
 680       __ jcc(Assembler::greaterEqual, L_copy_large);
 681     }
 682     if (CopyAVX3Threshold != 0) {
 683       __ cmpq(count, threshold[shift]);
 684       if (MaxVectorSize == 64) {
 685         // Copy using 64 byte vectors.
 686         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 687       } else {
 688         assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
 689         // REP MOVS offer a faster copy path.
 690         __ jcc(Assembler::greaterEqual, L_repmovs);
 691       }
 692     }
 693 
 694     if ((MaxVectorSize < 64)  || (CopyAVX3Threshold != 0)) {
 695       // Partial copy to make dst address 32 byte aligned.
 696       __ movq(temp2, to);
 697       __ andq(temp2, 31);
 698       __ jcc(Assembler::equal, L_main_pre_loop);
 699 
 700       __ negptr(temp2);
 701       __ addq(temp2, 32);
 702       if (shift) {
 703         __ shrq(temp2, shift);
 704       }
 705       __ movq(temp3, temp2);
 706       copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
 707       __ movq(temp4, temp2);
 708       __ movq(temp1, count);
 709       __ subq(temp1, temp2);
 710 
 711       __ cmpq(temp1, loop_size[shift]);
 712       __ jcc(Assembler::less, L_tail);
 713 
 714       __ BIND(L_main_pre_loop);
 715       __ subq(temp1, loop_size[shift]);
 716 
 717       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 718       __ align32();
 719       __ BIND(L_main_loop);
 720          copy64_avx(to, from, temp4, xmm1, false, shift, 0);
 721          copy64_avx(to, from, temp4, xmm1, false, shift, 64);
 722          copy64_avx(to, from, temp4, xmm1, false, shift, 128);
 723          __ addptr(temp4, loop_size[shift]);
 724          __ subq(temp1, loop_size[shift]);
 725          __ jcc(Assembler::greater, L_main_loop);
 726 
 727       __ addq(temp1, loop_size[shift]);
 728 
 729       // Tail loop.
 730       __ jmp(L_tail);
 731 
 732       __ BIND(L_repmovs);
 733         __ movq(temp2, temp1);
 734         // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics.
 735         __ movq(temp3, to);
 736         __ movq(to,  from);
 737         __ movq(from, temp3);
 738         // Save to/from for restoration post rep_mov.
 739         __ movq(temp1, to);
 740         __ movq(temp3, from);
 741         if(shift < 3) {
 742           __ shrq(temp2, 3-shift);     // quad word count
 743         }
 744         __ movq(temp4 , temp2);        // move quad ward count into temp4(RCX).
 745         __ rep_mov();
 746         __ shlq(temp2, 3);             // convert quad words into byte count.
 747         if(shift) {
 748           __ shrq(temp2, shift);       // type specific count.
 749         }
 750         // Restore original addresses in to/from.
 751         __ movq(to, temp3);
 752         __ movq(from, temp1);
 753         __ movq(temp4, temp2);
 754         __ movq(temp1, count);
 755         __ subq(temp1, temp2);         // tailing part (less than a quad ward size).
 756         __ jmp(L_tail);
 757     }
 758 
 759     if (MaxVectorSize > 32) {
 760       __ BIND(L_pre_main_post_64);
 761       // Partial copy to make dst address 64 byte aligned.
 762       __ movq(temp2, to);
 763       __ andq(temp2, 63);
 764       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 765 
 766       __ negptr(temp2);
 767       __ addq(temp2, 64);
 768       if (shift) {
 769         __ shrq(temp2, shift);
 770       }
 771       __ movq(temp3, temp2);
 772       copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
 773       __ movq(temp4, temp2);
 774       __ movq(temp1, count);
 775       __ subq(temp1, temp2);
 776 
 777       __ cmpq(temp1, loop_size[shift]);
 778       __ jcc(Assembler::less, L_tail64);
 779 
 780       __ BIND(L_main_pre_loop_64bytes);
 781       __ subq(temp1, loop_size[shift]);
 782 
 783       // Main loop with aligned copy block size of 192 bytes at
 784       // 64 byte copy granularity.
 785       __ align32();
 786       __ BIND(L_main_loop_64bytes);
 787          copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
 788          copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
 789          copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
 790          __ addptr(temp4, loop_size[shift]);
 791          __ subq(temp1, loop_size[shift]);
 792          __ jcc(Assembler::greater, L_main_loop_64bytes);
 793 
 794       __ addq(temp1, loop_size[shift]);
 795       // Zero length check.
 796       __ jcc(Assembler::lessEqual, L_exit);
 797 
 798       __ BIND(L_tail64);
 799 
 800       // Tail handling using 64 byte [masked] vector copy operations.
 801       use64byteVector = true;
 802       arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 803                                    temp4, temp3, use64byteVector, L_entry, L_exit);
 804     }
 805     __ BIND(L_exit);
 806   }
 807 
 808   __ BIND(L_finish);
 809   address ucme_exit_pc = __ pc();
 810   // When called from generic_arraycopy r11 contains specific values
 811   // used during arraycopy epilogue, re-initializing r11.
 812   if (is_oop) {
 813     __ movq(r11, shift == 3 ? count : to);
 814   }
 815   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
 816   restore_argument_regs(type);
 817   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
 818   __ xorptr(rax, rax); // return 0
 819   __ vzeroupper();
 820   __ leave(); // required for proper stackwalking of RuntimeStub frame
 821   __ ret(0);
 822 
 823   if (MaxVectorSize == 64) {
 824     __ BIND(L_copy_large);
 825       UnsafeMemoryAccessMark umam(this, add_handlers, false, ucme_exit_pc);
 826       arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
 827     __ jmp(L_finish);
 828   }
 829   // retrieve the registered handler addresses
 830   address end = __ pc();
 831   if (add_handlers) {
 832     retrieve_unsafe_access_handlers(start, end, extras);
 833   }
 834   assert(extras.length() == expected_extra_count,
 835          "unexpected handler addresses count %d", extras.length());
 836 #if INCLUDE_ZGC
 837   // retrieve addresses at which ZGC does colour patching
 838   if (add_relocs) {
 839     retrieve_reloc_addresses(start, end, extras);
 840   }
 841 #endif // INCLUDE_ZGC
 842 
 843   // record the stub entry and end plus the no_push entry and any
 844   // extra handler addresses
 845   store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
 846 
 847   return start;
 848 }
 849 
 850 void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
 851                                          Register temp3, Register temp4, Register count,
 852                                          XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
 853                                          XMMRegister xmm4, int shift) {
 854 
 855   // Type(shift)           byte(0), short(1), int(2),   long(3)
 856   int loop_size[]        = { 256,     128,       64,      32};
 857   int threshold[]        = { 4096,    2048,     1024,    512};
 858 
 859   Label L_main_loop_large;
 860   Label L_tail_large;
 861   Label L_exit_large;
 862   Label L_entry_large;
 863   Label L_main_pre_loop_large;
 864   Label L_pre_main_post_large;
 865 
 866   assert(MaxVectorSize == 64, "vector length != 64");
 867   __ BIND(L_entry_large);
 868 
 869   __ BIND(L_pre_main_post_large);
 870   // Partial copy to make dst address 64 byte aligned.
 871   __ movq(temp2, to);
 872   __ andq(temp2, 63);
 873   __ jcc(Assembler::equal, L_main_pre_loop_large);
 874 
 875   __ negptr(temp2);
 876   __ addq(temp2, 64);
 877   if (shift) {
 878     __ shrq(temp2, shift);
 879   }
 880   __ movq(temp3, temp2);
 881   copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
 882   __ movq(temp4, temp2);
 883   __ movq(temp1, count);
 884   __ subq(temp1, temp2);
 885 
 886   __ cmpq(temp1, loop_size[shift]);
 887   __ jcc(Assembler::less, L_tail_large);
 888 
 889   __ BIND(L_main_pre_loop_large);
 890   __ subq(temp1, loop_size[shift]);
 891 
 892   // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
 893   __ align32();
 894   __ BIND(L_main_loop_large);
 895   copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
 896   __ addptr(temp4, loop_size[shift]);
 897   __ subq(temp1, loop_size[shift]);
 898   __ jcc(Assembler::greater, L_main_loop_large);
 899   // fence needed because copy256_avx3 uses non-temporal stores
 900   __ sfence();
 901 
 902   __ addq(temp1, loop_size[shift]);
 903   // Zero length check.
 904   __ jcc(Assembler::lessEqual, L_exit_large);
 905   __ BIND(L_tail_large);
 906   // Tail handling using 64 byte [masked] vector copy operations.
 907   __ cmpq(temp1, 0);
 908   __ jcc(Assembler::lessEqual, L_exit_large);
 909   arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
 910                                temp4, temp3, L_exit_large);
 911   __ BIND(L_exit_large);
 912 }
 913 
 914 // Inputs:
 915 //   c_rarg0   - source array address
 916 //   c_rarg1   - destination array address
 917 //   c_rarg2   - element count, treated as ssize_t, can be zero
 918 //
 919 //
 920 address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, address* entry, address nooverlap_target) {
 921   // aligned is always false -- x86_64 always uses the unaligned code
 922   const bool aligned = false;
 923   int shift;
 924   bool is_oop;
 925   bool dest_uninitialized;
 926 
 927   switch (stub_id) {
 928   case StubId::stubgen_jbyte_arraycopy_id:
 929     shift = 0;
 930     is_oop = false;
 931     dest_uninitialized = false;
 932     break;
 933   case StubId::stubgen_jshort_arraycopy_id:
 934     shift = 1;
 935     is_oop = false;
 936     dest_uninitialized = false;
 937     break;
 938   case StubId::stubgen_jint_arraycopy_id:
 939     shift = 2;
 940     is_oop = false;
 941     dest_uninitialized = false;
 942     break;
 943   case StubId::stubgen_jlong_arraycopy_id:
 944     shift = 3;
 945     is_oop = false;
 946     dest_uninitialized = false;
 947     break;
 948   case StubId::stubgen_oop_arraycopy_id:
 949     shift = (UseCompressedOops ? 2 : 3);
 950     is_oop = true;
 951     dest_uninitialized = false;
 952     break;
 953   case StubId::stubgen_oop_arraycopy_uninit_id:
 954     shift = (UseCompressedOops ? 2 : 3);
 955     is_oop = true;
 956     dest_uninitialized = true;
 957     break;
 958   default:
 959     ShouldNotReachHere();
 960   }
 961   GrowableArray<address> entries;
 962   GrowableArray<address> extras;
 963   bool add_handlers = !is_oop && !aligned;
 964   bool add_relocs = UseZGC && is_oop;
 965   bool add_extras = add_handlers || add_relocs;
 966   int expected_entry_count = (entry != nullptr ? 2 : 1);
 967   int expected_handler_count = (add_handlers ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/1 x UMAM {start,end,handler}
 968   int entry_count = StubInfo::entry_count(stub_id);
 969   assert(entry_count == expected_entry_count, "sanity check");
 970   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
 971   GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
 972   address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
 973   if (start != nullptr) {
 974     assert(entries.length() == expected_entry_count - 1,
 975            "unexpected entry count %d", entries.length());
 976     assert(!add_handlers || extras.length() == expected_handler_count,
 977            "unexpected handler addresses count %d", extras.length());
 978     if (entry != nullptr) {
 979       *entry = entries.at(0);
 980     }
 981     if (add_handlers) {
 982       // restore 1 x UMAM {start,end,handler} addresses from extras
 983       register_unsafe_access_handlers(extras, 0, 1);
 984     }
 985 #if INCLUDE_ZGC
 986     if (add_relocs)  {
 987       // register addresses at which ZGC does colour patching
 988       register_reloc_addresses(extras, 0, extras.length());
 989     }
 990 #endif // INCLUDE_ZGC
 991     return start;
 992   }
 993   __ align(CodeEntryAlignment);
 994   StubCodeMark mark(this, stub_id);
 995   start = __ pc();
 996 
 997   bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
 998 
 999   Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1000   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1001   const Register from        = rdi;  // source array address
1002   const Register to          = rsi;  // destination array address
1003   const Register count       = rdx;  // elements count
1004   const Register temp1       = r8;
1005   const Register temp2       = rcx;
1006   const Register temp3       = r11;
1007   const Register temp4       = rax;
1008   // End pointers are inclusive, and if count is not zero they point
1009   // to the last unit copied:  end_to[0] := end_from[0]
1010 
1011   __ enter(); // required for proper stackwalking of RuntimeStub frame
1012   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1013 
1014   if (entry != nullptr) {
1015     *entry = __ pc();
1016     entries.append(*entry);
1017      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1018     BLOCK_COMMENT("Entry:");
1019   }
1020 
1021   array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
1022 
1023   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1024   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1025 
1026   setup_argument_regs(type);
1027 
1028   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1029   if (dest_uninitialized) {
1030     decorators |= IS_DEST_UNINITIALIZED;
1031   }
1032   if (aligned) {
1033     decorators |= ARRAYCOPY_ALIGNED;
1034   }
1035   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1036   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1037   {
1038     // Type(shift)       byte(0), short(1), int(2),   long(3)
1039     int loop_size[]   = { 192,     96,       48,      24};
1040     int threshold[]   = { 4096,    2048,     1024,    512};
1041 
1042     // UnsafeMemoryAccess page error: continue after unsafe access
1043     UnsafeMemoryAccessMark umam(this, add_handlers, true);
1044     // 'from', 'to' and 'count' are now valid
1045 
1046     // temp1 holds remaining count.
1047     __ movq(temp1, count);
1048 
1049     // Zero length check.
1050     __ BIND(L_tail);
1051     __ cmpq(temp1, 0);
1052     __ jcc(Assembler::lessEqual, L_exit);
1053 
1054     __ mov64(temp2, 0);
1055     __ movq(temp3, temp1);
1056     // Special cases using 32 byte [masked] vector copy operations.
1057     arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1058                                           temp4, use64byteVector, L_entry, L_exit);
1059 
1060     // PRE-MAIN-POST loop for aligned copy.
1061     __ BIND(L_entry);
1062 
1063     if ((MaxVectorSize > 32) && (CopyAVX3Threshold != 0)) {
1064       __ cmpq(temp1, threshold[shift]);
1065       __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1066     }
1067 
1068     if ((MaxVectorSize < 64)  || (CopyAVX3Threshold != 0)) {
1069       // Partial copy to make dst address 32 byte aligned.
1070       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1071       __ andq(temp2, 31);
1072       __ jcc(Assembler::equal, L_main_pre_loop);
1073 
1074       if (shift) {
1075         __ shrq(temp2, shift);
1076       }
1077       __ subq(temp1, temp2);
1078       copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
1079 
1080       __ cmpq(temp1, loop_size[shift]);
1081       __ jcc(Assembler::less, L_tail);
1082 
1083       __ BIND(L_main_pre_loop);
1084 
1085       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1086       __ align32();
1087       __ BIND(L_main_loop);
1088          copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1089          copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1090          copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1091          __ subptr(temp1, loop_size[shift]);
1092          __ cmpq(temp1, loop_size[shift]);
1093          __ jcc(Assembler::greater, L_main_loop);
1094 
1095       // Tail loop.
1096       __ jmp(L_tail);
1097     }
1098 
1099     if (MaxVectorSize > 32) {
1100       __ BIND(L_pre_main_post_64);
1101       // Partial copy to make dst address 64 byte aligned.
1102       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1103       __ andq(temp2, 63);
1104       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1105 
1106       if (shift) {
1107         __ shrq(temp2, shift);
1108       }
1109       __ subq(temp1, temp2);
1110       copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1111 
1112       __ cmpq(temp1, loop_size[shift]);
1113       __ jcc(Assembler::less, L_tail64);
1114 
1115       __ BIND(L_main_pre_loop_64bytes);
1116 
1117       // Main loop with aligned copy block size of 192 bytes at
1118       // 64 byte copy granularity.
1119       __ align32();
1120       __ BIND(L_main_loop_64bytes);
1121          copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1122          copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1123          copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1124          __ subq(temp1, loop_size[shift]);
1125          __ cmpq(temp1, loop_size[shift]);
1126          __ jcc(Assembler::greater, L_main_loop_64bytes);
1127 
1128       // Zero length check.
1129       __ cmpq(temp1, 0);
1130       __ jcc(Assembler::lessEqual, L_exit);
1131 
1132       __ BIND(L_tail64);
1133 
1134       // Tail handling using 64 byte [masked] vector copy operations.
1135       use64byteVector = true;
1136       __ mov64(temp2, 0);
1137       __ movq(temp3, temp1);
1138       arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1139                                             temp4, use64byteVector, L_entry, L_exit);
1140     }
1141     __ BIND(L_exit);
1142   }
1143   address ucme_exit_pc = __ pc();
1144   // When called from generic_arraycopy r11 contains specific values
1145   // used during arraycopy epilogue, re-initializing r11.
1146   if(is_oop) {
1147     __ movq(r11, count);
1148   }
1149   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1150   restore_argument_regs(type);
1151   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
1152   __ xorptr(rax, rax); // return 0
1153   __ vzeroupper();
1154   __ leave(); // required for proper stackwalking of RuntimeStub frame
1155   __ ret(0);
1156 
1157   // retrieve the registered handler addresses
1158   address end = __ pc();
1159   if (add_handlers) {
1160     retrieve_unsafe_access_handlers(start, end, extras);
1161   }
1162   assert(extras.length() == expected_handler_count,
1163          "unexpected handler addresses count %d", extras.length());
1164 #if INCLUDE_ZGC
1165   // retrieve addresses at which ZGC does colour patching
1166   if (add_relocs) {
1167     retrieve_reloc_addresses(start, end, extras);
1168   }
1169 #endif // INCLUDE_ZGC
1170   // record the stub entry and end plus the no_push entry and any
1171   // extra handler addresses
1172   store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
1173 
1174   return start;
1175 }
1176 
1177 void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
1178                                                  Register to, Register count, int shift,
1179                                                  Register index, Register temp,
1180                                                  bool use64byteVector, Label& L_entry, Label& L_exit) {
1181   Label L_entry_64, L_entry_96, L_entry_128;
1182   Label L_entry_160, L_entry_192;
1183 
1184   int size_mat[][6] = {
1185   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
1186   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
1187   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
1188   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
1189   };
1190 
1191   // Case A) Special case for length less than equal to 32 bytes.
1192   __ cmpq(count, size_mat[shift][0]);
1193   __ jccb(Assembler::greater, L_entry_64);
1194   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
1195   __ jmp(L_exit);
1196 
1197   // Case B) Special case for length less than equal to 64 bytes.
1198   __ BIND(L_entry_64);
1199   __ cmpq(count, size_mat[shift][1]);
1200   __ jccb(Assembler::greater, L_entry_96);
1201   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
1202   __ jmp(L_exit);
1203 
1204   // Case C) Special case for length less than equal to 96 bytes.
1205   __ BIND(L_entry_96);
1206   __ cmpq(count, size_mat[shift][2]);
1207   __ jccb(Assembler::greater, L_entry_128);
1208   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1209   __ subq(count, 64 >> shift);
1210   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
1211   __ jmp(L_exit);
1212 
1213   // Case D) Special case for length less than equal to 128 bytes.
1214   __ BIND(L_entry_128);
1215   __ cmpq(count, size_mat[shift][3]);
1216   __ jccb(Assembler::greater, L_entry_160);
1217   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1218   copy32_avx(to, from, index, xmm, shift, 64);
1219   __ subq(count, 96 >> shift);
1220   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
1221   __ jmp(L_exit);
1222 
1223   // Case E) Special case for length less than equal to 160 bytes.
1224   __ BIND(L_entry_160);
1225   __ cmpq(count, size_mat[shift][4]);
1226   __ jccb(Assembler::greater, L_entry_192);
1227   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1228   copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1229   __ subq(count, 128 >> shift);
1230   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
1231   __ jmp(L_exit);
1232 
1233   // Case F) Special case for length less than equal to 192 bytes.
1234   __ BIND(L_entry_192);
1235   __ cmpq(count, size_mat[shift][5]);
1236   __ jcc(Assembler::greater, L_entry);
1237   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1238   copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1239   copy32_avx(to, from, index, xmm, shift, 128);
1240   __ subq(count, 160 >> shift);
1241   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
1242   __ jmp(L_exit);
1243 }
1244 
1245 void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
1246                                                      Register to, Register count, int shift, Register index,
1247                                                      Register temp, Label& L_exit) {
1248   Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
1249 
1250   int size_mat[][4] = {
1251   /* T_BYTE */ {64, 128, 192, 256},
1252   /* T_SHORT*/ {32, 64 , 96 , 128},
1253   /* T_INT  */ {16, 32 , 48 ,  64},
1254   /* T_LONG */ { 8, 16 , 24 ,  32}
1255   };
1256 
1257   assert(MaxVectorSize == 64, "vector length != 64");
1258   // Case A) Special case for length less than or equal to 64 bytes.
1259   __ BIND(L_entry_64);
1260   __ cmpq(count, size_mat[shift][0]);
1261   __ jccb(Assembler::greater, L_entry_128);
1262   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
1263   __ jmp(L_exit);
1264 
1265   // Case B) Special case for length less than or equal to 128 bytes.
1266   __ BIND(L_entry_128);
1267   __ cmpq(count, size_mat[shift][1]);
1268   __ jccb(Assembler::greater, L_entry_192);
1269   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1270   __ subq(count, 64 >> shift);
1271   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
1272   __ jmp(L_exit);
1273 
1274   // Case C) Special case for length less than or equal to 192 bytes.
1275   __ BIND(L_entry_192);
1276   __ cmpq(count, size_mat[shift][2]);
1277   __ jcc(Assembler::greater, L_entry_256);
1278   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1279   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1280   __ subq(count, 128 >> shift);
1281   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
1282   __ jmp(L_exit);
1283 
1284   // Case D) Special case for length less than or equal to 256 bytes.
1285   __ BIND(L_entry_256);
1286   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1287   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1288   copy64_avx(to, from, index, xmm, false, shift, 128, true);
1289   __ subq(count, 192 >> shift);
1290   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
1291   __ jmp(L_exit);
1292 }
1293 
1294 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
1295                                                            Register to, Register start_index, Register end_index,
1296                                                            Register count, int shift, Register temp,
1297                                                            bool use64byteVector, Label& L_entry, Label& L_exit) {
1298   Label L_entry_64, L_entry_96, L_entry_128;
1299   Label L_entry_160, L_entry_192;
1300   bool avx3 = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
1301 
1302   int size_mat[][6] = {
1303   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
1304   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
1305   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
1306   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
1307   };
1308 
1309   // Case A) Special case for length less than equal to 32 bytes.
1310   __ cmpq(count, size_mat[shift][0]);
1311   __ jccb(Assembler::greater, L_entry_64);
1312   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1313   __ jmp(L_exit);
1314 
1315   // Case B) Special case for length less than equal to 64 bytes.
1316   __ BIND(L_entry_64);
1317   __ cmpq(count, size_mat[shift][1]);
1318   __ jccb(Assembler::greater, L_entry_96);
1319   if (avx3) {
1320      copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
1321   } else {
1322      copy32_avx(to, from, end_index, xmm, shift, -32);
1323      __ subq(count, 32 >> shift);
1324      copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1325   }
1326   __ jmp(L_exit);
1327 
1328   // Case C) Special case for length less than equal to 96 bytes.
1329   __ BIND(L_entry_96);
1330   __ cmpq(count, size_mat[shift][2]);
1331   __ jccb(Assembler::greater, L_entry_128);
1332   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1333   __ subq(count, 64 >> shift);
1334   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1335   __ jmp(L_exit);
1336 
1337   // Case D) Special case for length less than equal to 128 bytes.
1338   __ BIND(L_entry_128);
1339   __ cmpq(count, size_mat[shift][3]);
1340   __ jccb(Assembler::greater, L_entry_160);
1341   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1342   copy32_avx(to, from, end_index, xmm, shift, -96);
1343   __ subq(count, 96 >> shift);
1344   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1345   __ jmp(L_exit);
1346 
1347   // Case E) Special case for length less than equal to 160 bytes.
1348   __ BIND(L_entry_160);
1349   __ cmpq(count, size_mat[shift][4]);
1350   __ jccb(Assembler::greater, L_entry_192);
1351   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1352   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1353   __ subq(count, 128 >> shift);
1354   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1355   __ jmp(L_exit);
1356 
1357   // Case F) Special case for length less than equal to 192 bytes.
1358   __ BIND(L_entry_192);
1359   __ cmpq(count, size_mat[shift][5]);
1360   __ jcc(Assembler::greater, L_entry);
1361   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1362   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1363   copy32_avx(to, from, end_index, xmm, shift, -160);
1364   __ subq(count, 160 >> shift);
1365   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1366   __ jmp(L_exit);
1367 }
1368 
1369 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1370                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1371                                 int shift, int offset) {
1372   if (MaxVectorSize == 64) {
1373     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1374     __ prefetcht0(Address(src, index, scale, offset + 0x200));
1375     __ prefetcht0(Address(src, index, scale, offset + 0x240));
1376     __ prefetcht0(Address(src, index, scale, offset + 0x280));
1377     __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1378 
1379     __ prefetcht0(Address(src, index, scale, offset + 0x400));
1380     __ prefetcht0(Address(src, index, scale, offset + 0x440));
1381     __ prefetcht0(Address(src, index, scale, offset + 0x480));
1382     __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1383 
1384     __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1385     __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1386     __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1387     __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1388 
1389     __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1390     __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1391     __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1392     __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1393   }
1394 }
1395 
1396 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1397                                        KRegister mask, Register length, Register index,
1398                                        Register temp, int shift, int offset,
1399                                        bool use64byteVector) {
1400   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1401   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1402   if (!use64byteVector) {
1403     copy32_avx(dst, src, index, xmm, shift, offset);
1404     __ subptr(length, 32 >> shift);
1405     copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1406   } else {
1407     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1408     assert(MaxVectorSize == 64, "vector length != 64");
1409     __ mov64(temp, -1L);
1410     __ bzhiq(temp, temp, length);
1411     __ kmovql(mask, temp);
1412     __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1413     __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1414   }
1415 }
1416 
1417 
1418 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1419                                        KRegister mask, Register length, Register index,
1420                                        Register temp, int shift, int offset) {
1421   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1422   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1423   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1424   __ mov64(temp, -1L);
1425   __ bzhiq(temp, temp, length);
1426   __ kmovql(mask, temp);
1427   __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1428   __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1429 }
1430 
1431 
1432 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1433                                 int shift, int offset) {
1434   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1435   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1436   __ vmovdqu(xmm, Address(src, index, scale, offset));
1437   __ vmovdqu(Address(dst, index, scale, offset), xmm);
1438 }
1439 
1440 
1441 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1442                                 bool conjoint, int shift, int offset, bool use64byteVector) {
1443   assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1444   if (!use64byteVector) {
1445     if (conjoint) {
1446       copy32_avx(dst, src, index, xmm, shift, offset+32);
1447       copy32_avx(dst, src, index, xmm, shift, offset);
1448     } else {
1449       copy32_avx(dst, src, index, xmm, shift, offset);
1450       copy32_avx(dst, src, index, xmm, shift, offset+32);
1451     }
1452   } else {
1453     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1454     __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1455     __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1456   }
1457 }
1458 
1459 #endif // COMPILER2
1460 
1461 
1462 // Arguments:
1463 //   entry     - location for return of (post-push) entry
1464 //
1465 // Inputs:
1466 //   c_rarg0   - source array address
1467 //   c_rarg1   - destination array address
1468 //   c_rarg2   - element count, treated as ssize_t, can be zero
1469 //
1470 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1471 // we let the hardware handle it.  The one to eight bytes within words,
1472 // dwords or qwords that span cache line boundaries will still be loaded
1473 // and stored atomically.
1474 //
1475 // Side Effects:
1476 //   entry is set to the no-overlap entry point
1477 //   used by generate_conjoint_byte_copy().
1478 //
1479 address StubGenerator::generate_disjoint_byte_copy(address* entry) {
1480   StubId stub_id = StubId::stubgen_jbyte_disjoint_arraycopy_id;
1481   // aligned is always false -- x86_64 always uses the unaligned code
1482   const bool aligned = false;
1483 #ifdef COMPILER2
1484   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1485     return generate_disjoint_copy_avx3_masked(stub_id, entry);
1486   }
1487 #endif // COMPILER2
1488   GrowableArray<address> entries;
1489   GrowableArray<address> extras;
1490   int expected_entry_count = (entry != nullptr ? 2 : 1);
1491   int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1492   int entry_count = StubInfo::entry_count(stub_id);
1493   assert(entry_count == expected_entry_count, "sanity check");
1494   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1495   address start = load_archive_data(stub_id, entries_ptr, &extras);
1496   if (start != nullptr) {
1497     assert(entries.length() == expected_entry_count - 1,
1498            "unexpected entry count %d", entries.length());
1499     assert(extras.length() == expected_handler_count,
1500            "unexpected handler addresses count %d", extras.length());
1501     if (entry != nullptr) {
1502       *entry = entries.at(0);
1503     }
1504     // restore 2 UMAM {start,end,handler} addresses from extras
1505     register_unsafe_access_handlers(extras, 0, 2);
1506     return start;
1507   }
1508   __ align(CodeEntryAlignment);
1509   StubCodeMark mark(this, stub_id);
1510   start = __ pc();
1511   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1512 
1513   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1514   Label L_copy_byte, L_exit;
1515   const Register from        = rdi;  // source array address
1516   const Register to          = rsi;  // destination array address
1517   const Register count       = rdx;  // elements count
1518   const Register byte_count  = rcx;
1519   const Register qword_count = count;
1520   const Register end_from    = from; // source array end address
1521   const Register end_to      = to;   // destination array end address
1522   // End pointers are inclusive, and if count is not zero they point
1523   // to the last unit copied:  end_to[0] := end_from[0]
1524 
1525   __ enter(); // required for proper stackwalking of RuntimeStub frame
1526   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1527 
1528   if (entry != nullptr) {
1529     *entry = __ pc();
1530     entries.append(*entry);
1531      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1532     BLOCK_COMMENT("Entry:");
1533   }
1534 
1535   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1536                     // r9 and r10 may be used to save non-volatile registers
1537 
1538   {
1539     // UnsafeMemoryAccess page error: continue after unsafe access
1540     UnsafeMemoryAccessMark umam(this, !aligned, true);
1541     // 'from', 'to' and 'count' are now valid
1542     __ movptr(byte_count, count);
1543     __ shrptr(count, 3); // count => qword_count
1544 
1545     // Copy from low to high addresses.  Use 'to' as scratch.
1546     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1547     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1548     __ negptr(qword_count); // make the count negative
1549     __ jmp(L_copy_bytes);
1550 
1551     // Copy trailing qwords
1552   __ BIND(L_copy_8_bytes);
1553     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1554     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1555     __ increment(qword_count);
1556     __ jcc(Assembler::notZero, L_copy_8_bytes);
1557 
1558     // Check for and copy trailing dword
1559   __ BIND(L_copy_4_bytes);
1560     __ testl(byte_count, 4);
1561     __ jccb(Assembler::zero, L_copy_2_bytes);
1562     __ movl(rax, Address(end_from, 8));
1563     __ movl(Address(end_to, 8), rax);
1564 
1565     __ addptr(end_from, 4);
1566     __ addptr(end_to, 4);
1567 
1568     // Check for and copy trailing word
1569   __ BIND(L_copy_2_bytes);
1570     __ testl(byte_count, 2);
1571     __ jccb(Assembler::zero, L_copy_byte);
1572     __ movw(rax, Address(end_from, 8));
1573     __ movw(Address(end_to, 8), rax);
1574 
1575     __ addptr(end_from, 2);
1576     __ addptr(end_to, 2);
1577 
1578     // Check for and copy trailing byte
1579   __ BIND(L_copy_byte);
1580     __ testl(byte_count, 1);
1581     __ jccb(Assembler::zero, L_exit);
1582     __ movb(rax, Address(end_from, 8));
1583     __ movb(Address(end_to, 8), rax);
1584   }
1585 __ BIND(L_exit);
1586   address ucme_exit_pc = __ pc();
1587   restore_arg_regs();
1588   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1589   __ xorptr(rax, rax); // return 0
1590   __ vzeroupper();
1591   __ leave(); // required for proper stackwalking of RuntimeStub frame
1592   __ ret(0);
1593 
1594   {
1595     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1596     // Copy in multi-bytes chunks
1597     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1598     __ jmp(L_copy_4_bytes);
1599   }
1600 
1601   // retrieve the registered handler addresses
1602   address end = __ pc();
1603   retrieve_unsafe_access_handlers(start, end, extras);
1604   assert(extras.length() == expected_handler_count,
1605          "unexpected handler addresses count %d", extras.length());
1606 
1607   // record the stub entry and end plus the no_push entry and any
1608   // extra handler addresses
1609   store_archive_data(stub_id, start, end, entries_ptr, &extras);
1610 
1611   return start;
1612 }
1613 
1614 
1615 // Arguments:
1616 //   entry     - location for return of (post-push) entry
1617 //   nooverlap_target - entry to branch to if no overlap detected
1618 //
1619 // Inputs:
1620 //   c_rarg0   - source array address
1621 //   c_rarg1   - destination array address
1622 //   c_rarg2   - element count, treated as ssize_t, can be zero
1623 //
1624 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1625 // we let the hardware handle it.  The one to eight bytes within words,
1626 // dwords or qwords that span cache line boundaries will still be loaded
1627 // and stored atomically.
1628 //
1629 address StubGenerator::generate_conjoint_byte_copy(address nooverlap_target, address* entry) {
1630   StubId stub_id = StubId::stubgen_jbyte_arraycopy_id;
1631   // aligned is always false -- x86_64 always uses the unaligned code
1632   const bool aligned = false;
1633 #ifdef COMPILER2
1634   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1635     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
1636   }
1637 #endif // COMPILER2
1638   GrowableArray<address> entries;
1639   GrowableArray<address> extras;
1640   int expected_entry_count = (entry != nullptr ? 2 : 1);
1641   int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1642   int entry_count = StubInfo::entry_count(stub_id);
1643   assert(entry_count == expected_entry_count, "sanity check");
1644   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1645   address start = load_archive_data(stub_id, entries_ptr, &extras);
1646   if (start != nullptr) {
1647     assert(entries.length() == expected_entry_count - 1,
1648            "unexpected entry count %d", entries.length());
1649     assert(extras.length() == expected_handler_count,
1650            "unexpected handler addresses count %d", extras.length());
1651     if (entry != nullptr) {
1652       *entry = entries.at(0);
1653     }
1654     // restore 2 UMAM {start,end,handler} addresses from extras
1655     register_unsafe_access_handlers(extras, 0, 2);
1656     return start;
1657   }
1658   __ align(CodeEntryAlignment);
1659   StubCodeMark mark(this, stub_id);
1660   start = __ pc();
1661   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1662 
1663   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1664   const Register from        = rdi;  // source array address
1665   const Register to          = rsi;  // destination array address
1666   const Register count       = rdx;  // elements count
1667   const Register byte_count  = rcx;
1668   const Register qword_count = count;
1669 
1670   __ enter(); // required for proper stackwalking of RuntimeStub frame
1671   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1672 
1673   if (entry != nullptr) {
1674     *entry = __ pc();
1675     entries.append(*entry);
1676     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1677     BLOCK_COMMENT("Entry:");
1678   }
1679 
1680   array_overlap_test(nooverlap_target, Address::times_1);
1681   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1682                     // r9 and r10 may be used to save non-volatile registers
1683 
1684   {
1685     // UnsafeMemoryAccess page error: continue after unsafe access
1686     UnsafeMemoryAccessMark umam(this, !aligned, true);
1687     // 'from', 'to' and 'count' are now valid
1688     __ movptr(byte_count, count);
1689     __ shrptr(count, 3);   // count => qword_count
1690 
1691     // Copy from high to low addresses.
1692 
1693     // Check for and copy trailing byte
1694     __ testl(byte_count, 1);
1695     __ jcc(Assembler::zero, L_copy_2_bytes);
1696     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1697     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1698     __ decrement(byte_count); // Adjust for possible trailing word
1699 
1700     // Check for and copy trailing word
1701   __ BIND(L_copy_2_bytes);
1702     __ testl(byte_count, 2);
1703     __ jcc(Assembler::zero, L_copy_4_bytes);
1704     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1705     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1706 
1707     // Check for and copy trailing dword
1708   __ BIND(L_copy_4_bytes);
1709     __ testl(byte_count, 4);
1710     __ jcc(Assembler::zero, L_copy_bytes);
1711     __ movl(rax, Address(from, qword_count, Address::times_8));
1712     __ movl(Address(to, qword_count, Address::times_8), rax);
1713     __ jmp(L_copy_bytes);
1714 
1715     // Copy trailing qwords
1716   __ BIND(L_copy_8_bytes);
1717     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1718     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1719     __ decrement(qword_count);
1720     __ jcc(Assembler::notZero, L_copy_8_bytes);
1721   }
1722   restore_arg_regs();
1723   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1724   __ xorptr(rax, rax); // return 0
1725   __ vzeroupper();
1726   __ leave(); // required for proper stackwalking of RuntimeStub frame
1727   __ ret(0);
1728 
1729   {
1730     // UnsafeMemoryAccess page error: continue after unsafe access
1731     UnsafeMemoryAccessMark umam(this, !aligned, true);
1732     // Copy in multi-bytes chunks
1733     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1734   }
1735   restore_arg_regs();
1736   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1737   __ xorptr(rax, rax); // return 0
1738   __ vzeroupper();
1739   __ leave(); // required for proper stackwalking of RuntimeStub frame
1740   __ ret(0);
1741 
1742   // retrieve the registered handler addresses
1743   address end = __ pc();
1744   retrieve_unsafe_access_handlers(start, end, extras);
1745   assert(extras.length() == expected_handler_count,
1746          "unexpected handler addresses count %d", extras.length());
1747 
1748   // record the stub entry and end plus the no_push entry and any
1749   // extra handler addresses
1750   store_archive_data(stub_id, start, end, entries_ptr, &extras);
1751 
1752   return start;
1753 }
1754 
1755 
1756 // Arguments:
1757 //   entry     - location for return of (post-push) entry
1758 //
1759 // Inputs:
1760 //   c_rarg0   - source array address
1761 //   c_rarg1   - destination array address
1762 //   c_rarg2   - element count, treated as ssize_t, can be zero
1763 //
1764 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1765 // let the hardware handle it.  The two or four words within dwords
1766 // or qwords that span cache line boundaries will still be loaded
1767 // and stored atomically.
1768 //
1769 // Side Effects:
1770 //   entry is set to the no-overlap entry point
1771 //   used by generate_conjoint_short_copy().
1772 //
1773 address StubGenerator::generate_disjoint_short_copy(address *entry) {
1774   StubId stub_id = StubId::stubgen_jshort_disjoint_arraycopy_id;
1775   // aligned is always false -- x86_64 always uses the unaligned code
1776   const bool aligned = false;
1777 #ifdef COMPILER2
1778   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1779     return generate_disjoint_copy_avx3_masked(stub_id, entry);
1780   }
1781 #endif // COMPILER2
1782   GrowableArray<address> entries;
1783   GrowableArray<address> extras;
1784   int expected_entry_count = (entry != nullptr ? 2 : 1);
1785   int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1786   int entry_count = StubInfo::entry_count(stub_id);
1787   assert(entry_count == expected_entry_count, "sanity check");
1788   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1789   address start = load_archive_data(stub_id, entries_ptr, &extras);
1790   if (start != nullptr) {
1791     assert(entries.length() == expected_entry_count - 1,
1792            "unexpected entry count %d", entries.length());
1793     assert(extras.length() == expected_handler_count,
1794            "unexpected handler addresses count %d", extras.length());
1795     if (entry != nullptr) {
1796       *entry = entries.at(0);
1797     }
1798     // restore 2 UMAM {start,end,handler} addresses from extras
1799     register_unsafe_access_handlers(extras, 0, 2);
1800     return start;
1801   }
1802   __ align(CodeEntryAlignment);
1803   StubCodeMark mark(this, stub_id);
1804   start = __ pc();
1805   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1806 
1807   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
1808   const Register from        = rdi;  // source array address
1809   const Register to          = rsi;  // destination array address
1810   const Register count       = rdx;  // elements count
1811   const Register word_count  = rcx;
1812   const Register qword_count = count;
1813   const Register end_from    = from; // source array end address
1814   const Register end_to      = to;   // destination array end address
1815   // End pointers are inclusive, and if count is not zero they point
1816   // to the last unit copied:  end_to[0] := end_from[0]
1817 
1818   __ enter(); // required for proper stackwalking of RuntimeStub frame
1819   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1820 
1821   if (entry != nullptr) {
1822     *entry = __ pc();
1823     entries.append(*entry);
1824     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1825     BLOCK_COMMENT("Entry:");
1826   }
1827 
1828   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1829                     // r9 and r10 may be used to save non-volatile registers
1830 
1831   {
1832     // UnsafeMemoryAccess page error: continue after unsafe access
1833     UnsafeMemoryAccessMark umam(this, !aligned, true);
1834     // 'from', 'to' and 'count' are now valid
1835     __ movptr(word_count, count);
1836     __ shrptr(count, 2); // count => qword_count
1837 
1838     // Copy from low to high addresses.  Use 'to' as scratch.
1839     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1840     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1841     __ negptr(qword_count);
1842     __ jmp(L_copy_bytes);
1843 
1844     // Copy trailing qwords
1845   __ BIND(L_copy_8_bytes);
1846     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1847     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1848     __ increment(qword_count);
1849     __ jcc(Assembler::notZero, L_copy_8_bytes);
1850 
1851     // Original 'dest' is trashed, so we can't use it as a
1852     // base register for a possible trailing word copy
1853 
1854     // Check for and copy trailing dword
1855   __ BIND(L_copy_4_bytes);
1856     __ testl(word_count, 2);
1857     __ jccb(Assembler::zero, L_copy_2_bytes);
1858     __ movl(rax, Address(end_from, 8));
1859     __ movl(Address(end_to, 8), rax);
1860 
1861     __ addptr(end_from, 4);
1862     __ addptr(end_to, 4);
1863 
1864     // Check for and copy trailing word
1865   __ BIND(L_copy_2_bytes);
1866     __ testl(word_count, 1);
1867     __ jccb(Assembler::zero, L_exit);
1868     __ movw(rax, Address(end_from, 8));
1869     __ movw(Address(end_to, 8), rax);
1870   }
1871 __ BIND(L_exit);
1872   address ucme_exit_pc = __ pc();
1873   restore_arg_regs();
1874   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1875   __ xorptr(rax, rax); // return 0
1876   __ vzeroupper();
1877   __ leave(); // required for proper stackwalking of RuntimeStub frame
1878   __ ret(0);
1879 
1880   {
1881     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1882     // Copy in multi-bytes chunks
1883     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1884     __ jmp(L_copy_4_bytes);
1885   }
1886 
1887   // retrieve the registered handler addresses
1888   address end = __ pc();
1889   retrieve_unsafe_access_handlers(start, end, extras);
1890   assert(extras.length() == expected_handler_count,
1891          "unexpected handler addresses count %d", extras.length());
1892 
1893   // record the stub entry and end plus the no_push entry and any
1894   // extra handler addresses
1895   store_archive_data(stub_id, start, end, entries_ptr, &extras);
1896 
1897   return start;
1898 }
1899 
1900 
1901 address StubGenerator::generate_fill(StubId stub_id) {
1902   BasicType t;
1903   bool aligned;
1904   switch (stub_id) {
1905   case StubId::stubgen_jbyte_fill_id:
1906     t = T_BYTE;
1907     aligned = false;
1908     break;
1909   case StubId::stubgen_jshort_fill_id:
1910     t = T_SHORT;
1911     aligned = false;
1912     break;
1913   case StubId::stubgen_jint_fill_id:
1914     t = T_INT;
1915     aligned = false;
1916     break;
1917   case StubId::stubgen_arrayof_jbyte_fill_id:
1918     t = T_BYTE;
1919     aligned = true;
1920     break;
1921   case StubId::stubgen_arrayof_jshort_fill_id:
1922     t = T_SHORT;
1923     aligned = true;
1924     break;
1925   case StubId::stubgen_arrayof_jint_fill_id:
1926     t = T_INT;
1927     aligned = true;
1928     break;
1929   default:
1930     ShouldNotReachHere();
1931   }
1932   int entry_count = StubInfo::entry_count(stub_id);
1933   assert(entry_count == 1, "sanity check");
1934   GrowableArray<address> extras;
1935   bool add_handlers = ((t == T_BYTE) && !aligned);
1936   int handlers_count = (add_handlers ? 1 : 0);
1937   int expected_extras_count = (handlers_count * UnsafeMemoryAccess::COLUMN_COUNT); // 0/1 x UMAM {start,end,handler}
1938   GrowableArray<address>* extras_ptr = (add_handlers ? &extras : nullptr);
1939   address start = load_archive_data(stub_id, nullptr, extras_ptr);
1940   if (start != nullptr) {
1941     assert(extras.length() == expected_extras_count,
1942            "unexpected handler addresses count %d", extras.length());
1943     if (add_handlers) {
1944       // restore 1 x UMAM {start,end,handler} addresses from extras
1945       register_unsafe_access_handlers(extras, 0, 1);
1946     }
1947     return start;
1948   }
1949 
1950   __ align(CodeEntryAlignment);
1951   StubCodeMark mark(this, stub_id);
1952   start = __ pc();
1953 
1954   BLOCK_COMMENT("Entry:");
1955 
1956   const Register to       = c_rarg0;  // destination array address
1957   const Register value    = c_rarg1;  // value
1958   const Register count    = c_rarg2;  // elements count
1959   __ mov(r11, count);
1960 
1961   __ enter(); // required for proper stackwalking of RuntimeStub frame
1962 
1963   {
1964     // Add set memory mark to protect against unsafe accesses faulting
1965     UnsafeMemoryAccessMark umam(this, add_handlers, true);
1966     __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1967   }
1968 
1969   __ vzeroupper();
1970   __ leave(); // required for proper stackwalking of RuntimeStub frame
1971   __ ret(0);
1972 
1973   address end = __ pc();
1974   if (add_handlers) {
1975     retrieve_unsafe_access_handlers(start, end, extras);
1976   }
1977   assert(extras.length() == expected_extras_count,
1978          "unexpected handler addresses count %d", extras.length());
1979   // record the stub entry and end
1980   store_archive_data(stub_id, start, end, nullptr, extras_ptr);
1981 
1982   return start;
1983 }
1984 
1985 
1986 // Arguments:
1987 //   entry     - location for return of (post-push) entry
1988 //   nooverlap_target - entry to branch to if no overlap detected
1989 //
1990 // Inputs:
1991 //   c_rarg0   - source array address
1992 //   c_rarg1   - destination array address
1993 //   c_rarg2   - element count, treated as ssize_t, can be zero
1994 //
1995 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1996 // let the hardware handle it.  The two or four words within dwords
1997 // or qwords that span cache line boundaries will still be loaded
1998 // and stored atomically.
1999 //
2000 address StubGenerator::generate_conjoint_short_copy(address nooverlap_target, address *entry) {
2001   StubId stub_id = StubId::stubgen_jshort_arraycopy_id;
2002   // aligned is always false -- x86_64 always uses the unaligned code
2003   const bool aligned = false;
2004 #ifdef COMPILER2
2005   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2006     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2007   }
2008 #endif // COMPILER2
2009   GrowableArray<address> entries;
2010   GrowableArray<address> extras;
2011   int expected_entry_count = (entry != nullptr ? 2 : 1);
2012   int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
2013   int entry_count = StubInfo::entry_count(stub_id);
2014   assert(entry_count == expected_entry_count, "sanity check");
2015   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2016   address start = load_archive_data(stub_id, entries_ptr, &extras);
2017   if (start != nullptr) {
2018     assert(entries.length() == expected_entry_count - 1,
2019            "unexpected entry count %d", entries.length());
2020     assert(extras.length() == expected_handler_count,
2021            "unexpected handler addresses count %d", extras.length());
2022     if (entry != nullptr) {
2023       *entry = entries.at(0);
2024     }
2025     // restore 2 UMAM {start,end,handler} addresses from extras
2026     register_unsafe_access_handlers(extras, 0, 2);
2027     return start;
2028   }
2029   __ align(CodeEntryAlignment);
2030   StubCodeMark mark(this, stub_id);
2031   start = __ pc();
2032   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2033 
2034   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2035   const Register from        = rdi;  // source array address
2036   const Register to          = rsi;  // destination array address
2037   const Register count       = rdx;  // elements count
2038   const Register word_count  = rcx;
2039   const Register qword_count = count;
2040 
2041   __ enter(); // required for proper stackwalking of RuntimeStub frame
2042   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2043 
2044   if (entry != nullptr) {
2045     *entry = __ pc();
2046     entries.append(*entry);
2047     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2048     BLOCK_COMMENT("Entry:");
2049   }
2050 
2051   array_overlap_test(nooverlap_target, Address::times_2);
2052   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2053                     // r9 and r10 may be used to save non-volatile registers
2054 
2055   {
2056     // UnsafeMemoryAccess page error: continue after unsafe access
2057     UnsafeMemoryAccessMark umam(this, !aligned, true);
2058     // 'from', 'to' and 'count' are now valid
2059     __ movptr(word_count, count);
2060     __ shrptr(count, 2); // count => qword_count
2061 
2062     // Copy from high to low addresses.  Use 'to' as scratch.
2063 
2064     // Check for and copy trailing word
2065     __ testl(word_count, 1);
2066     __ jccb(Assembler::zero, L_copy_4_bytes);
2067     __ movw(rax, Address(from, word_count, Address::times_2, -2));
2068     __ movw(Address(to, word_count, Address::times_2, -2), rax);
2069 
2070    // Check for and copy trailing dword
2071   __ BIND(L_copy_4_bytes);
2072     __ testl(word_count, 2);
2073     __ jcc(Assembler::zero, L_copy_bytes);
2074     __ movl(rax, Address(from, qword_count, Address::times_8));
2075     __ movl(Address(to, qword_count, Address::times_8), rax);
2076     __ jmp(L_copy_bytes);
2077 
2078     // Copy trailing qwords
2079   __ BIND(L_copy_8_bytes);
2080     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2081     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2082     __ decrement(qword_count);
2083     __ jcc(Assembler::notZero, L_copy_8_bytes);
2084   }
2085   restore_arg_regs();
2086   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2087   __ xorptr(rax, rax); // return 0
2088   __ vzeroupper();
2089   __ leave(); // required for proper stackwalking of RuntimeStub frame
2090   __ ret(0);
2091 
2092   {
2093     // UnsafeMemoryAccess page error: continue after unsafe access
2094     UnsafeMemoryAccessMark umam(this, !aligned, true);
2095     // Copy in multi-bytes chunks
2096     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
2097   }
2098   restore_arg_regs();
2099   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2100   __ xorptr(rax, rax); // return 0
2101   __ vzeroupper();
2102   __ leave(); // required for proper stackwalking of RuntimeStub frame
2103   __ ret(0);
2104 
2105   // retrieve the registered handler addresses
2106   address end = __ pc();
2107   retrieve_unsafe_access_handlers(start, end, extras);
2108   assert(extras.length() == expected_handler_count,
2109          "unexpected handler addresses count %d", extras.length());
2110 
2111   // record the stub entry and end plus the no_push entry and any
2112   // extra handler addresses
2113   store_archive_data(stub_id, start, end, entries_ptr, &extras);
2114 
2115   return start;
2116 }
2117 
2118 
// Generates the arraycopy stub for disjoint (non-overlapping) arrays of
// 4-byte elements: jint arrays and, with UseCompressedOops, oop arrays.
// Elements are copied from low to high addresses.
//
// Arguments:
//   stub_id   - unique id for stub to generate; it also determines
//               is_oop (true => oop array, copied as T_OBJECT rather
//               than T_INT) and dest_uninitialized
//   entry     - location for return of (post-push) entry, may be nullptr
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it.  The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
//   if 'entry' is non-null, *entry is set to the no-overlap entry point
//   used by generate_conjoint_int_oop_copy().
//
// Returns the start address of the stub.
//
address StubGenerator::generate_disjoint_int_oop_copy(StubId stub_id, address* entry) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  bool is_oop;
  bool dest_uninitialized;
  // The stub id selects the element kind and whether the destination
  // is treated as uninitialized.
  switch (stub_id) {
  case StubId::stubgen_jint_disjoint_arraycopy_id:
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_id:
    // oops are only copied as 4-byte elements when they are compressed
    assert(UseCompressedOops, "inconsistent oop copy size!");
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
    assert(UseCompressedOops, "inconsistent oop copy size!");
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef COMPILER2
  // Delegate to the AVX3 masked variant when the CPU features allow it
  // and, for oop copies, the barrier set supports it.
  if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
    return generate_disjoint_copy_avx3_masked(stub_id, entry);
  }
#endif // COMPILER2
  // 'entries' collects the extra (post-push) entry address; 'extras'
  // collects either the unsafe access handler addresses (non-oop copies)
  // or the ZGC reloc addresses (oop copies with UseZGC), never both,
  // because add_handlers requires !is_oop and add_relocs requires is_oop.
  GrowableArray<address> entries;
  GrowableArray<address> extras;
  bool add_handlers = !is_oop && !aligned;
  bool add_relocs = UseZGC && is_oop;
  bool add_extras = add_handlers || add_relocs;
  int expected_entry_count = (entry != nullptr ? 2 : 1);
  int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == expected_entry_count, "sanity check");
  GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
  GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
  // A non-null result means the stub does not need to be generated:
  // re-register the saved addresses and return the saved start.
  address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
  if (start != nullptr) {
    assert(entries.length() == expected_entry_count - 1,
           "unexpected entry count %d", entries.length());
    assert(!add_handlers || extras.length() == expected_handler_count,
           "unexpected handler addresses count %d", extras.length());
    if (entry != nullptr) {
      *entry = entries.at(0);
    }
    if (add_handlers) {
      // restore 2 UMAM {start,end,handler} addresses from extras
      register_unsafe_access_handlers(extras, 0, 2);
    }
#if INCLUDE_ZGC
    // register addresses at which ZGC does colour patching
    if (add_relocs)  {
      register_reloc_addresses(extras, 0, extras.length());
    }
#endif // INCLUDE_ZGC
    return start;
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register dword_count = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    // Publish the entry point that follows the frame setup and record
    // it so that it is stored together with the stub.
    *entry = __ pc();
    entries.append(*entry);
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                 // r9 is used to save r15_thread

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BasicType type = is_oop ? T_OBJECT : T_INT;
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, add_handlers, true);
    // 'from', 'to' and 'count' are now valid
    // Keep the element (dword) count in dword_count; 'count' becomes the
    // number of whole qwords.
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count

    // Copy from low to high addresses.  'from' and 'to' are overwritten
    // with the (inclusive) end pointers and qword_count is negated so
    // that it counts up towards zero.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    // L_copy_bytes is handed to copy_bytes_forward() below.
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
    __ jccb(Assembler::zero, L_exit);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  // Remember the exit pc; it is passed to the second
  // UnsafeMemoryAccessMark below.
  address ucme_exit_pc = __ pc();
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ vzeroupper();
  __ xorptr(rax, rax); // return 0
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // NOTE(review): presumably a faulting access in this region resumes
    // at ucme_exit_pc -- confirm against UnsafeMemoryAccessMark.
    UnsafeMemoryAccessMark umam(this, add_handlers, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
    // Finish with the trailing dword check above.
    __ jmp(L_copy_4_bytes);
  }

  // retrieve the registered handler addresses
  address end = __ pc();
  if (add_handlers) {
    retrieve_unsafe_access_handlers(start, end, extras);
  }
  assert(extras.length() == expected_handler_count,
         "unexpected handler addresses count %d", extras.length());
#if INCLUDE_ZGC
  // retrieve addresses at which ZGC does colour patching
  if (add_relocs) {
    retrieve_reloc_addresses(start, end, extras);
  }
#endif // INCLUDE_ZGC

  // record the stub entry and end plus the no_push entry and any
  // extra handler addresses
  store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);

  return start;
}
2303 
2304 
2305 // Arguments:
2306 //   entry     - location for return of (post-push) entry
2307 //   nooverlap_target - entry to branch to if no overlap detected
2308 //   is_oop  - true => oop array, so generate store check code
2309 //
2310 // Inputs:
2311 //   c_rarg0   - source array address
2312 //   c_rarg1   - destination array address
2313 //   c_rarg2   - element count, treated as ssize_t, can be zero
2314 //
2315 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2316 // the hardware handle it.  The two dwords within qwords that span
2317 // cache line boundaries will still be loaded and stored atomically.
2318 //
2319 address StubGenerator::generate_conjoint_int_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2320   // aligned is always false -- x86_64 always uses the unaligned code
2321   const bool aligned = false;
2322   bool is_oop;
2323   bool dest_uninitialized;
2324   switch (stub_id) {
2325   case StubId::stubgen_jint_arraycopy_id:
2326     is_oop = false;
2327     dest_uninitialized = false;
2328     break;
2329   case StubId::stubgen_oop_arraycopy_id:
2330     assert(UseCompressedOops, "inconsistent oop copy size!");
2331     is_oop = true;
2332     dest_uninitialized = false;
2333     break;
2334   case StubId::stubgen_oop_arraycopy_uninit_id:
2335     assert(UseCompressedOops, "inconsistent oop copy size!");
2336     is_oop = true;
2337     dest_uninitialized = true;
2338     break;
2339   default:
2340     ShouldNotReachHere();
2341   }
2342 
2343   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2344 #ifdef COMPILER2
2345   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2346     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2347   }
2348 #endif // COMPILER2
2349   bool add_handlers = !is_oop && !aligned;
2350   bool add_relocs = UseZGC && is_oop;
2351   bool add_extras = add_handlers || add_relocs;
2352   GrowableArray<address> entries;
2353   GrowableArray<address> extras;
2354   int expected_entry_count = (entry != nullptr ? 2 : 1);
2355   int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2356   int entry_count = StubInfo::entry_count(stub_id);
2357   assert(entry_count == expected_entry_count, "sanity check");
2358   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2359   GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2360   address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2361   if (start != nullptr) {
2362     assert(entries.length() == expected_entry_count - 1,
2363            "unexpected entry count %d", entries.length());
2364     assert(!add_handlers || extras.length() == expected_handler_count,
2365            "unexpected handler addresses count %d", extras.length());
2366     if (entry != nullptr) {
2367       *entry = entries.at(0);
2368     }
2369     if (add_handlers) {
2370       // restore 2 UMAM {start,end,handler} addresses from extras
2371       register_unsafe_access_handlers(extras, 0, 2);
2372     }
2373 #if INCLUDE_ZGC
2374     // register addresses at which ZGC does colour patching
2375     if (add_relocs)  {
2376       register_reloc_addresses(extras, 6, extras.length());
2377     }
2378 #endif // INCLUDE_ZGC
2379     return start;
2380   }
2381 
2382   __ align(CodeEntryAlignment);
2383   StubCodeMark mark(this, stub_id);
2384   start = __ pc();
2385 
2386   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2387   const Register from        = rdi;  // source array address
2388   const Register to          = rsi;  // destination array address
2389   const Register count       = rdx;  // elements count
2390   const Register dword_count = rcx;
2391   const Register qword_count = count;
2392 
2393   __ enter(); // required for proper stackwalking of RuntimeStub frame
2394   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2395 
2396   if (entry != nullptr) {
2397     *entry = __ pc();
2398     entries.append(*entry);
2399     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2400     BLOCK_COMMENT("Entry:");
2401   }
2402 
2403   array_overlap_test(nooverlap_target, Address::times_4);
2404   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2405                                  // r9 is used to save r15_thread
2406 
2407   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2408   if (dest_uninitialized) {
2409     decorators |= IS_DEST_UNINITIALIZED;
2410   }
2411   if (aligned) {
2412     decorators |= ARRAYCOPY_ALIGNED;
2413   }
2414 
2415   BasicType type = is_oop ? T_OBJECT : T_INT;
2416   // no registers are destroyed by this call
2417   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2418 
2419   assert_clean_int(count, rax); // Make sure 'count' is clean int.
2420   {
2421     // UnsafeMemoryAccess page error: continue after unsafe access
2422     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2423     // 'from', 'to' and 'count' are now valid
2424     __ movptr(dword_count, count);
2425     __ shrptr(count, 1); // count => qword_count
2426 
2427     // Copy from high to low addresses.  Use 'to' as scratch.
2428 
2429     // Check for and copy trailing dword
2430     __ testl(dword_count, 1);
2431     __ jcc(Assembler::zero, L_copy_bytes);
2432     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2433     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2434     __ jmp(L_copy_bytes);
2435 
2436     // Copy trailing qwords
2437     __ BIND(L_copy_8_bytes);
2438     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2439     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2440     __ decrement(qword_count);
2441     __ jcc(Assembler::notZero, L_copy_8_bytes);
2442   }
2443   if (is_oop) {
2444     __ jmp(L_exit);
2445   }
2446   restore_arg_regs_using_thread();
2447   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2448   __ xorptr(rax, rax); // return 0
2449   __ vzeroupper();
2450   __ leave(); // required for proper stackwalking of RuntimeStub frame
2451   __ ret(0);
2452 
2453   {
2454     // UnsafeMemoryAccess page error: continue after unsafe access
2455     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2456     // Copy in multi-bytes chunks
2457     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2458   }
2459 
2460   __ BIND(L_exit);
2461   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2462   restore_arg_regs_using_thread();
2463   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2464   __ xorptr(rax, rax); // return 0
2465   __ vzeroupper();
2466   __ leave(); // required for proper stackwalking of RuntimeStub frame
2467   __ ret(0);
2468 
2469   // retrieve the registered handler addresses
2470   address end = __ pc();
2471   if (add_handlers) {
2472     retrieve_unsafe_access_handlers(start, end, extras);
2473   }
2474   assert(extras.length() == expected_handler_count,
2475          "unexpected handler addresses count %d", extras.length());
2476 #if INCLUDE_ZGC
2477   // retrieve addresses at which ZGC does colour patching
2478   if (add_relocs) {
2479     retrieve_reloc_addresses(start, end, extras);
2480   }
2481 #endif // INCLUDE_ZGC
2482   // record the stub entry and end plus the no_push entry and any
2483   // extra handler addresses
2484   store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2485 
2486   return start;
2487 }
2488 
2489 
2490 // Arguments:
2491 //   entry     - location for return of (post-push) entry
2492 //
2493 // Inputs:
2494 //   c_rarg0   - source array address
2495 //   c_rarg1   - destination array address
2496 //   c_rarg2   - element count, treated as ssize_t, can be zero
2497 //
2498  // Side Effects:
2499 //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2500 //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2501 //
2502 address StubGenerator::generate_disjoint_long_oop_copy(StubId stub_id, address *entry) {
2503   // aligned is always false -- x86_64 always uses the unaligned code
2504   const bool aligned = false;
2505   bool is_oop;
2506   bool dest_uninitialized;
2507   switch (stub_id) {
2508   case StubId::stubgen_jlong_disjoint_arraycopy_id:
2509     is_oop = false;
2510     dest_uninitialized = false;
2511     break;
2512   case StubId::stubgen_oop_disjoint_arraycopy_id:
2513     assert(!UseCompressedOops, "inconsistent oop copy size!");
2514     is_oop = true;
2515     dest_uninitialized = false;
2516     break;
2517   case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2518     assert(!UseCompressedOops, "inconsistent oop copy size!");
2519     is_oop = true;
2520     dest_uninitialized = true;
2521     break;
2522   default:
2523     ShouldNotReachHere();
2524   }
2525 
2526   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2527 #ifdef COMPILER2
2528   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2529     return generate_disjoint_copy_avx3_masked(stub_id, entry);
2530   }
2531 #endif // COMPILER2
2532   bool add_handlers = !is_oop && !aligned;
2533   bool add_relocs = UseZGC && is_oop;
2534   bool add_extras = add_handlers || add_relocs;
2535   GrowableArray<address> entries;
2536   GrowableArray<address> extras;
2537   int expected_entry_count = (entry != nullptr ? 2 : 1);
2538   int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2539   int entry_count = StubInfo::entry_count(stub_id);
2540   assert(entry_count == expected_entry_count, "sanity check");
2541   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2542   GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2543   address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2544   if (start != nullptr) {
2545     assert(entries.length() == expected_entry_count - 1,
2546            "unexpected entry count %d", entries.length());
2547     assert(!add_handlers || extras.length() == expected_handler_count,
2548            "unexpected handler addresses count %d", extras.length());
2549     if (entry != nullptr) {
2550       *entry = entries.at(0);
2551     }
2552     if (add_handlers) {
2553       // restore 2 UMAM {start,end,handler} addresses from extras
2554       register_unsafe_access_handlers(extras, 0, 2);
2555     }
2556 #if INCLUDE_ZGC
2557     // register addresses at which ZGC does colour patching
2558     if (add_relocs)  {
2559       register_reloc_addresses(extras, 0, extras.length());
2560     }
2561 #endif // INCLUDE_ZGC
2562     return start;
2563   }
2564 
2565   __ align(CodeEntryAlignment);
2566   StubCodeMark mark(this, stub_id);
2567   start = __ pc();
2568 
2569   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2570   const Register from        = rdi;  // source array address
2571   const Register to          = rsi;  // destination array address
2572   const Register qword_count = rdx;  // elements count
2573   const Register end_from    = from; // source array end address
2574   const Register end_to      = rcx;  // destination array end address
2575   const Register saved_count = r11;
2576   // End pointers are inclusive, and if count is not zero they point
2577   // to the last unit copied:  end_to[0] := end_from[0]
2578 
2579   __ enter(); // required for proper stackwalking of RuntimeStub frame
2580   // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2581   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2582 
2583   if (entry != nullptr) {
2584     *entry = __ pc();
2585     entries.append(*entry);
2586     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2587     BLOCK_COMMENT("Entry:");
2588   }
2589 
2590   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2591                                    // r9 is used to save r15_thread
2592   // 'from', 'to' and 'qword_count' are now valid
2593 
2594   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2595   if (dest_uninitialized) {
2596     decorators |= IS_DEST_UNINITIALIZED;
2597   }
2598   if (aligned) {
2599     decorators |= ARRAYCOPY_ALIGNED;
2600   }
2601 
2602   BasicType type = is_oop ? T_OBJECT : T_LONG;
2603   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2604   {
2605     // UnsafeMemoryAccess page error: continue after unsafe access
2606     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2607 
2608     // Copy from low to high addresses.  Use 'to' as scratch.
2609     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2610     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2611     __ negptr(qword_count);
2612     __ jmp(L_copy_bytes);
2613 
2614     // Copy trailing qwords
2615   __ BIND(L_copy_8_bytes);
2616     bs->copy_load_at(_masm, decorators, type, 8,
2617                      rax, Address(end_from, qword_count, Address::times_8, 8),
2618                      r10);
2619     bs->copy_store_at(_masm, decorators, type, 8,
2620                       Address(end_to, qword_count, Address::times_8, 8), rax,
2621                       r10);
2622     __ increment(qword_count);
2623     __ jcc(Assembler::notZero, L_copy_8_bytes);
2624   }
2625   if (is_oop) {
2626     __ jmp(L_exit);
2627   } else {
2628     restore_arg_regs_using_thread();
2629     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2630     __ xorptr(rax, rax); // return 0
2631     __ vzeroupper();
2632     __ leave(); // required for proper stackwalking of RuntimeStub frame
2633     __ ret(0);
2634   }
2635 
2636   {
2637     // UnsafeMemoryAccess page error: continue after unsafe access
2638     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2639     // Copy in multi-bytes chunks
2640     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2641   }
2642 
2643   __ BIND(L_exit);
2644   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2645   restore_arg_regs_using_thread();
2646   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2647                           SharedRuntime::_jlong_array_copy_ctr,
2648                  rscratch1); // Update counter after rscratch1 is free
2649   __ vzeroupper();
2650   __ xorptr(rax, rax); // return 0
2651   __ leave(); // required for proper stackwalking of RuntimeStub frame
2652   __ ret(0);
2653 
2654   // retrieve the registered handler addresses
2655   address end = __ pc();
2656   if (add_handlers) {
2657     retrieve_unsafe_access_handlers(start, end, extras);
2658   }
2659   assert(extras.length() == expected_handler_count,
2660          "unexpected handler addresses count %d", extras.length());
2661 #if INCLUDE_ZGC
2662   // retrieve addresses at which ZGC does colour patching
2663   if (add_relocs) {
2664     retrieve_reloc_addresses(start, end, extras);
2665   }
2666 #endif // INCLUDE_ZGC
2667   // record the stub entry and end plus the no_push entry and any
2668   // extra handler addresses
2669   store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2670 
2671   return start;
2672 }
2673 
2674 
2675 // Arguments:
2676 //   entry     - location for return of (post-push) entry
2677 //   nooverlap_target - entry to branch to if no overlap detected
2678 //   is_oop  - true => oop array, so generate store check code
2679 //
2680 // Inputs:
2681 //   c_rarg0   - source array address
2682 //   c_rarg1   - destination array address
2683 //   c_rarg2   - element count, treated as ssize_t, can be zero
2684 //
2685 address StubGenerator::generate_conjoint_long_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2686   // aligned is always false -- x86_64 always uses the unaligned code
2687   const bool aligned = false;
2688   bool is_oop;
2689   bool dest_uninitialized;
2690   switch (stub_id) {
2691   case StubId::stubgen_jlong_arraycopy_id:
2692     is_oop = false;
2693     dest_uninitialized = false;
2694     break;
2695   case StubId::stubgen_oop_arraycopy_id:
2696     assert(!UseCompressedOops, "inconsistent oop copy size!");
2697     is_oop = true;
2698     dest_uninitialized = false;
2699     break;
2700   case StubId::stubgen_oop_arraycopy_uninit_id:
2701     assert(!UseCompressedOops, "inconsistent oop copy size!");
2702     is_oop = true;
2703     dest_uninitialized = true;
2704     break;
2705   default:
2706     ShouldNotReachHere();
2707   }
2708 
2709   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2710 #ifdef COMPILER2
2711   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2712     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2713   }
2714 #endif // COMPILER2
2715   bool add_handlers = !is_oop && !aligned;
2716   bool add_relocs = UseZGC && is_oop;
2717   bool add_extras = add_handlers || add_relocs;
2718   GrowableArray<address> entries;
2719   GrowableArray<address> extras;
2720   int expected_entry_count = (entry != nullptr ? 2 : 1);
2721   int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2722   int entry_count = StubInfo::entry_count(stub_id);
2723   assert(entry_count == expected_entry_count, "sanity check");
2724   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2725   GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2726   address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2727   if (start != nullptr) {
2728     assert(entries.length() == expected_entry_count - 1,
2729            "unexpected entry count %d", entries.length());
2730     assert(!add_handlers || extras.length() == expected_handler_count,
2731            "unexpected handler addresses count %d", extras.length());
2732     if (entry != nullptr) {
2733       *entry = entries.at(0);
2734     }
2735     if (add_handlers) {
2736       // restore 2 UMAM {start,end,handler} addresses from extras
2737       register_unsafe_access_handlers(extras, 0, 2);
2738     }
2739 #if INCLUDE_ZGC
2740     // register addresses at which ZGC does colour patching
2741     if (add_relocs)  {
2742       register_reloc_addresses(extras, 0, extras.length());
2743     }
2744 #endif // INCLUDE_ZGC
2745     return start;
2746   }
2747 
2748   __ align(CodeEntryAlignment);
2749   StubCodeMark mark(this, stub_id);
2750   start = __ pc();
2751 
2752   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2753   const Register from        = rdi;  // source array address
2754   const Register to          = rsi;  // destination array address
2755   const Register qword_count = rdx;  // elements count
2756   const Register saved_count = rcx;
2757 
2758   __ enter(); // required for proper stackwalking of RuntimeStub frame
2759   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2760 
2761   if (entry != nullptr) {
2762     *entry = __ pc();
2763     entries.append(*entry);
2764     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2765     BLOCK_COMMENT("Entry:");
2766   }
2767 
2768   array_overlap_test(nooverlap_target, Address::times_8);
2769   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2770                                  // r9 is used to save r15_thread
2771   // 'from', 'to' and 'qword_count' are now valid
2772 
2773   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2774   if (dest_uninitialized) {
2775     decorators |= IS_DEST_UNINITIALIZED;
2776   }
2777   if (aligned) {
2778     decorators |= ARRAYCOPY_ALIGNED;
2779   }
2780 
2781   BasicType type = is_oop ? T_OBJECT : T_LONG;
2782   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2783   {
2784     // UnsafeMemoryAccess page error: continue after unsafe access
2785     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2786 
2787     __ jmp(L_copy_bytes);
2788 
2789     // Copy trailing qwords
2790   __ BIND(L_copy_8_bytes);
2791     bs->copy_load_at(_masm, decorators, type, 8,
2792                      rax, Address(from, qword_count, Address::times_8, -8),
2793                      r10);
2794     bs->copy_store_at(_masm, decorators, type, 8,
2795                       Address(to, qword_count, Address::times_8, -8), rax,
2796                       r10);
2797     __ decrement(qword_count);
2798     __ jcc(Assembler::notZero, L_copy_8_bytes);
2799   }
2800   if (is_oop) {
2801     __ jmp(L_exit);
2802   } else {
2803     restore_arg_regs_using_thread();
2804     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2805     __ xorptr(rax, rax); // return 0
2806     __ vzeroupper();
2807     __ leave(); // required for proper stackwalking of RuntimeStub frame
2808     __ ret(0);
2809   }
2810   {
2811     // UnsafeMemoryAccess page error: continue after unsafe access
2812     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2813 
2814     // Copy in multi-bytes chunks
2815     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2816   }
2817   __ BIND(L_exit);
2818   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2819   restore_arg_regs_using_thread();
2820   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2821                           SharedRuntime::_jlong_array_copy_ctr,
2822                  rscratch1); // Update counter after rscratch1 is free
2823   __ vzeroupper();
2824   __ xorptr(rax, rax); // return 0
2825   __ leave(); // required for proper stackwalking of RuntimeStub frame
2826   __ ret(0);
2827 
2828 
2829   // retrieve the registered handler addresses
2830   address end = __ pc();
2831   if (add_handlers) {
2832     retrieve_unsafe_access_handlers(start, end, extras);
2833   }
2834   assert(extras.length() == expected_handler_count,
2835          "unexpected handler addresses count %d", extras.length());
2836 #if INCLUDE_ZGC
2837   // retrieve addresses at which ZGC does colour patching
2838   if ((UseZGC && is_oop)) {
2839     retrieve_reloc_addresses(start, end, extras);
2840   }
2841 #endif // INCLUDE_ZGC
2842   // record the stub entry and end plus the no_push entry and any
2843   // extra handler addresses
2844   store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2845 
2846   return start;
2847 }
2848 
2849 
// Helper for generating a dynamic type check.
// Smashes no registers.
//
// Arguments:
//   sub_klass          - register holding the klass to be checked
//   super_check_offset - register passed to the fast path check as the
//                        super check offset
//   super_klass        - register holding the klass to check against
//   L_success          - label branched to when the check succeeds
//
// The three registers must be distinct.  On failure the generated code
// falls through to the instruction following this helper.
void StubGenerator::generate_type_check(Register sub_klass,
                                        Register super_check_offset,
                                        Register super_klass,
                                        Label& L_success) {
  assert_different_registers(sub_klass, super_check_offset, super_klass);

  BLOCK_COMMENT("type_check:");

  Label L_miss;

  // Fast path: given both a success and a miss target; no temp register
  // (noreg) is supplied.
  __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
                                   super_check_offset);
  // Slow path: only the success target is given, so a failed check
  // continues with the code bound below.
  __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);

  // Fall through on failure!
  __ BIND(L_miss);
}
2869 
//
//  Generate checkcasting array copy stub
//
//  The emitted stub copies oops one element at a time from the source
//  to the destination.  Null elements are stored without a check; every
//  non-null element is type-checked against ckval (using ckoff) before
//  it is stored.  The copy stops at the first element that fails the
//  check.
//
//  Input:
//    c_rarg0   - source array address
//    c_rarg1   - destination array address
//    c_rarg2   - element count, treated as ssize_t, can be zero
//    c_rarg3   - size_t ckoff (super_check_offset)
// not Win64
//    c_rarg4   - oop ckval (super_klass)
// Win64
//    rsp+40    - oop ckval (super_klass)
//
//  Output:
//    rax ==  0  -  success
//    rax == -1^K - failure, where K is partial transfer count
//
//  Parameters:
//    stub_id - selects the variant: stubgen_checkcast_arraycopy_id or
//              stubgen_checkcast_arraycopy_uninit_id (the latter adds the
//              IS_DEST_UNINITIALIZED decorator).
//    entry   - if non-null, receives the address of the secondary entry
//              point located after the argument register setup.
//
//  Returns the start address of the stub.
//
address StubGenerator::generate_checkcast_copy(StubId stub_id, address *entry) {

  // Map the stub id onto the only property that differs between the
  // two variants.
  bool dest_uninitialized;
  switch (stub_id) {
  case StubId::stubgen_checkcast_arraycopy_id:
    dest_uninitialized = false;
    break;
  case StubId::stubgen_checkcast_arraycopy_uninit_id:
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  // entries collects the secondary entry point (if any); extras collects
  // the ZGC relocation addresses and is only passed on when UseZGC is set.
  GrowableArray<address> entries;
  GrowableArray<address> extras;
  int expected_entry_count = (entry != nullptr ? 2 : 1);
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == expected_entry_count, "sanity check");
  GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
  GrowableArray<address>* extras_ptr = (UseZGC ? &extras : nullptr);
  // If load_archive_data hands back a start address the stub is not
  // generated again; the loaded entry and reloc addresses are reused.
  address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
  if (start != nullptr) {
    assert(entries.length() == expected_entry_count - 1,
           "unexpected addresses count %d", entries.length());
    if (entry != nullptr) {
      *entry = entries.at(0);
    }
#if INCLUDE_ZGC
    if (UseZGC)  {
      register_reloc_addresses(extras, 0, extras.length());
    }
#endif // INCLUDE_ZGC
    return start;
  }

  Label L_load_element, L_store_element, L_do_card_marks, L_done;

  // Input registers (after setup_arg_regs)
  const Register from        = rdi;   // source array address
  const Register to          = rsi;   // destination array address
  const Register length      = rdx;   // elements count
  const Register ckoff       = rcx;   // super_check_offset
  const Register ckval       = r8;    // super_klass

  // Registers used as temps (r13, r14 are save-on-entry)
  const Register end_from    = from;  // source array end address
  const Register end_to      = r13;   // destination array end address
  const Register count       = rdx;   // -(count_remaining)
  const Register r14_length  = r14;   // saved copy of length
  // End pointers are exclusive: they are computed below as
  // base + length * element_size (see end_from_addr / end_to_addr) and
  // are indexed with the negative 'count'.

  const Register rax_oop    = rax;    // actual oop copied
  const Register r11_klass  = r11;    // oop._klass

  //---------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the two arrays are subtypes of Object[] but the
  // destination array type is not equal to or a supertype
  // of the source type.  Each element must be separately
  // checked.

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
  // caller guarantees that the arrays really are different
  // otherwise, we would have to make conjoint checks
  { Label L;
    array_overlap_test(L, TIMES_OOP);
    __ stop("checkcast_copy within a single array");
    __ bind(L);
  }
#endif //ASSERT

  setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
                                  // ckoff => rcx, ckval => r8
                                  // r9 is used to save r15_thread
#ifdef _WIN64
  // last argument (#4) is on stack on Win64
  __ movptr(ckval, Address(rsp, 6 * wordSize));
#endif

  // Caller of this entry point must set up the argument registers.
  // The entry address is also appended to 'entries' so that it is
  // handed to store_archive_data at the end.
  if (entry != nullptr) {
    *entry = __ pc();
    entries.append(*entry);
    BLOCK_COMMENT("Entry:");
  }

  // allocate spill slots for r13, r14 and r10
  // (saved_rbp_offset is not a slot itself; it is only used as the
  // number of words to reserve, i.e. three.)
  enum {
    saved_r13_offset,
    saved_r14_offset,
    saved_r10_offset,
    saved_rbp_offset
  };
  __ subptr(rsp, saved_rbp_offset * wordSize);
  __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
  __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
  __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);

#ifdef ASSERT
    // Debug check: r15_thread must still match the value returned by
    // get_thread_slow.  r14 is free as a temp because it was just saved.
    Label L2;
    __ get_thread_slow(r14);
    __ cmpptr(r15_thread, r14);
    __ jcc(Assembler::equal, L2);
    __ stop("StubRoutines::call_stub: r15_thread is modified by call");
    __ bind(L2);
#endif // ASSERT

  // check that int operands are properly extended to size_t
  assert_clean_int(length, rax);
  assert_clean_int(ckoff, rax);

#ifdef ASSERT
  BLOCK_COMMENT("assert consistent ckoff/ckval");
  // The ckoff and ckval must be mutually consistent,
  // even though caller generates both.
  { Label L;
    int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ cmpl(ckoff, Address(ckval, sco_offset));
    __ jcc(Assembler::equal, L);
    __ stop("super_check_offset inconsistent");
    __ bind(L);
  }
#endif //ASSERT

  // Loop-invariant addresses.  They are exclusive end pointers.
  Address end_from_addr(from, length, TIMES_OOP, 0);
  Address   end_to_addr(to,   length, TIMES_OOP, 0);
  // Loop-variant addresses.  They assume post-incremented count < 0.
  Address from_element_addr(end_from, count, TIMES_OOP, 0);
  Address   to_element_addr(end_to,   count, TIMES_OOP, 0);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }

  BasicType type = T_OBJECT;
  // Must agree with TIMES_OOP, which scales by 4 or 8 on the same flag.
  size_t element_size = UseCompressedOops ? 4 : 8;

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  // Copy from low to high addresses, indexed from the end of each array.
  __ lea(end_from, end_from_addr);
  __ lea(end_to,   end_to_addr);
  __ movptr(r14_length, length);        // save a copy of the length
  assert(length == count, "");          // else fix next line:
  __ negptr(count);                     // negate and test the length
  __ jcc(Assembler::notZero, L_load_element);

  // Empty array:  Nothing to do.
  __ xorptr(rax, rax);                  // return 0 on (trivial) success
  __ jmp(L_done);

  // ======== begin loop ========
  // (Loop is rotated; its entry is L_load_element.)
  // Loop control:
  //   for (count = -count; count != 0; count++)
  // Base pointers end_from, end_to point just past the last element;
  // the negative count indexes backwards from them.
  __ align(OptoLoopAlignment);

  __ BIND(L_store_element);
  bs->copy_store_at(_masm,
                    decorators,
                    type,
                    element_size,
                    to_element_addr,
                    rax_oop,
                    r10);
  __ increment(count);               // increment the count toward zero
  __ jcc(Assembler::zero, L_do_card_marks);

  // ======== loop entry is here ========
  __ BIND(L_load_element);
  bs->copy_load_at(_masm,
                   decorators,
                   type,
                   element_size,
                   rax_oop,
                   from_element_addr,
                   r10);
  // A null element needs no type check; store it directly.
  __ testptr(rax_oop, rax_oop);
  __ jcc(Assembler::zero, L_store_element);

  __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
  // Branches back to L_store_element on success, falls through on failure.
  generate_type_check(r11_klass, ckoff, ckval, L_store_element);
  // ======== end loop ========

  // It was a real error; we must depend on the caller to finish the job.
  // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
  // Emit GC store barriers for the oops we have copied (r14 + rdx),
  // and report their number to the caller.
  assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
  Label L_post_barrier;
  __ addptr(r14_length, count);     // K = (original - remaining) oops
  __ movptr(rax, r14_length);       // save the value
  __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
  // The flags tested here are still those produced by the addptr above.
  __ jccb(Assembler::notZero, L_post_barrier);
  __ jmp(L_done); // K == 0, nothing was copied, skip post barrier

  // Come here on success only.
  __ BIND(L_do_card_marks);
  __ xorptr(rax, rax);              // return 0 on success

  // r14_length holds the number of oops actually copied on both paths:
  // the full length on success, K on partial failure.
  __ BIND(L_post_barrier);
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);

  // Common exit point (success or failure).
  __ BIND(L_done);
  __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
  __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
  __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  address end = __ pc();
#if INCLUDE_ZGC
  // retrieve addresses at which ZGC does colour patching
  if (UseZGC) {
    retrieve_reloc_addresses(start, end, extras);
  }
#endif // INCLUDE_ZGC
  // record the stub entry and end plus the no_push entry
    store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);

  return start;
}
3124 
3125 
3126 //  Generate 'unsafe' array copy stub
3127 //  Though just as safe as the other stubs, it takes an unscaled
3128 //  size_t argument instead of an element count.
3129 //
3130 //  Input:
3131 //    c_rarg0   - source array address
3132 //    c_rarg1   - destination array address
3133 //    c_rarg2   - byte count, treated as ssize_t, can be zero
3134 //
3135 // Examines the alignment of the operands and dispatches
3136 // to a long, int, short, or byte copy loop.
3137 //
3138 address StubGenerator::generate_unsafe_copy(address byte_copy_entry, address short_copy_entry,
3139                                             address int_copy_entry, address long_copy_entry) {
3140 
3141   StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
3142   int entry_count = StubInfo::entry_count(stub_id);
3143   assert(entry_count == 1, "sanity check");
3144   address start = load_archive_data(stub_id);
3145   if (start != nullptr) {
3146     return start;
3147   }
3148 
3149   Label L_long_aligned, L_int_aligned, L_short_aligned;
3150 
3151   // Input registers (before setup_arg_regs)
3152   const Register from        = c_rarg0;  // source array address
3153   const Register to          = c_rarg1;  // destination array address
3154   const Register size        = c_rarg2;  // byte count (size_t)
3155 
3156   // Register used as a temp
3157   const Register bits        = rax;      // test copy of low bits
3158 
3159   __ align(CodeEntryAlignment);
3160   StubCodeMark mark(this, stub_id);
3161   start = __ pc();
3162 
3163   __ enter(); // required for proper stackwalking of RuntimeStub frame
3164 
3165   // bump this on entry, not on exit:
3166   INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
3167 
3168   __ mov(bits, from);
3169   __ orptr(bits, to);
3170   __ orptr(bits, size);
3171 
3172   __ testb(bits, BytesPerLong-1);
3173   __ jccb(Assembler::zero, L_long_aligned);
3174 
3175   __ testb(bits, BytesPerInt-1);
3176   __ jccb(Assembler::zero, L_int_aligned);
3177 
3178   __ testb(bits, BytesPerShort-1);
3179   __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
3180 
3181   __ BIND(L_short_aligned);
3182   __ shrptr(size, LogBytesPerShort); // size => short_count
3183   __ jump(RuntimeAddress(short_copy_entry));
3184 
3185   __ BIND(L_int_aligned);
3186   __ shrptr(size, LogBytesPerInt); // size => int_count
3187   __ jump(RuntimeAddress(int_copy_entry));
3188 
3189   __ BIND(L_long_aligned);
3190   __ shrptr(size, LogBytesPerLong); // size => qword_count
3191   __ jump(RuntimeAddress(long_copy_entry));
3192 
3193   // record the stub entry and end plus
3194   store_archive_data(stub_id, start, __ pc());
3195 
3196   return start;
3197 }
3198 
3199 
// Static enum for helper
// Selects the store width emitted by do_setmemory_atomic_loop:
//   USM_SHORT    - 2-byte stores (movw)
//   USM_DWORD    - 4-byte stores (movl)
//   USM_QUADWORD - 8-byte stores (movq)
enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
3202 // Helper for generate_unsafe_setmemory
3203 //
3204 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
3205 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
3206                                      Register size, Register wide_value,
3207                                      Register tmp, Label& L_exit,
3208                                      MacroAssembler *_masm) {
3209   Label L_Loop, L_Tail, L_TailLoop;
3210 
3211   int shiftval = 0;
3212   int incr = 0;
3213 
3214   switch (type) {
3215     case USM_SHORT:
3216       shiftval = 1;
3217       incr = 16;
3218       break;
3219     case USM_DWORD:
3220       shiftval = 2;
3221       incr = 32;
3222       break;
3223     case USM_QUADWORD:
3224       shiftval = 3;
3225       incr = 64;
3226       break;
3227   }
3228 
3229   // At this point, we know the lower bits of size are zero
3230   __ shrq(size, shiftval);
3231   // size now has number of X-byte chunks (2, 4 or 8)
3232 
3233   // Number of (8*X)-byte chunks into tmp
3234   __ movq(tmp, size);
3235   __ shrq(tmp, 3);
3236   __ jccb(Assembler::zero, L_Tail);
3237 
3238   __ BIND(L_Loop);
3239 
3240   // Unroll 8 stores
3241   for (int i = 0; i < 8; i++) {
3242     switch (type) {
3243       case USM_SHORT:
3244         __ movw(Address(dest, (2 * i)), wide_value);
3245         break;
3246       case USM_DWORD:
3247         __ movl(Address(dest, (4 * i)), wide_value);
3248         break;
3249       case USM_QUADWORD:
3250         __ movq(Address(dest, (8 * i)), wide_value);
3251         break;
3252     }
3253   }
3254   __ addq(dest, incr);
3255   __ decrementq(tmp);
3256   __ jccb(Assembler::notZero, L_Loop);
3257 
3258   __ BIND(L_Tail);
3259 
3260   // Find number of remaining X-byte chunks
3261   __ andq(size, 0x7);
3262 
3263   // If zero, then we're done
3264   __ jccb(Assembler::zero, L_exit);
3265 
3266   __ BIND(L_TailLoop);
3267 
3268     switch (type) {
3269       case USM_SHORT:
3270         __ movw(Address(dest, 0), wide_value);
3271         break;
3272       case USM_DWORD:
3273         __ movl(Address(dest, 0), wide_value);
3274         break;
3275       case USM_QUADWORD:
3276         __ movq(Address(dest, 0), wide_value);
3277         break;
3278     }
3279   __ addq(dest, incr >> 3);
3280   __ decrementq(size);
3281   __ jccb(Assembler::notZero, L_TailLoop);
3282 }
3283 
//  Generate 'unsafe' set memory stub
//  Though just as safe as the other stubs, it takes an unscaled
//  size_t (# bytes) argument instead of an element count.
//
//  Input:
//    c_rarg0   - destination array address
//    c_rarg1   - byte count (size_t)
//    c_rarg2   - byte value
//
// Examines the alignment of the destination address and byte count and
// dispatches to a quadword (8-byte), dword (4-byte) or 2-byte fill loop
// emitted inline, or tail-calls unsafe_byte_fill when neither is even
// 2-byte aligned.  A zero byte count returns immediately.
//
// unsafe_byte_fill must be non-null (asserted); it is the target of the
// byte-fill tail call.  Returns the start address of the stub.
//
address StubGenerator::generate_unsafe_setmemory(address unsafe_byte_fill) {
  StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // we expect three set of extra unsafememory access handler entries
  // (one per UnsafeMemoryAccessMark scope below)
  GrowableArray<address> extras;
  int expected_handler_count = 3 * UnsafeMemoryAccess::COLUMN_COUNT;
  // Reuse the archived stub when load_archive_data supplies one; its
  // handler addresses still have to be registered.
  address start = load_archive_data(stub_id, nullptr, &extras);
  if (start != nullptr) {
    assert(extras.length() == expected_handler_count,
           "unexpected handler addresses count %d", extras.length());
    register_unsafe_access_handlers(extras, 0, 3);
    return start;
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  __ enter();   // required for proper stackwalking of RuntimeStub frame

  assert(unsafe_byte_fill != nullptr, "Invalid call");

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);

  {
    Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;

    const Register dest = c_rarg0;
    const Register size = c_rarg1;
    const Register byteVal = c_rarg2;
    const Register wide_value = rax;
    const Register rScratch1 = r10;

    assert_different_registers(dest, size, byteVal, wide_value, rScratch1);

    //     fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)

    // Nothing to do for a zero byte count.
    __ testq(size, size);
    __ jcc(Assembler::zero, L_exit);

    // Propagate byte to full Register
    // (zero-extended byte * 0x0101010101010101 replicates it into all
    // eight byte lanes of wide_value)
    __ movzbl(rScratch1, byteVal);
    __ mov64(wide_value, 0x0101010101010101ULL);
    __ imulq(wide_value, rScratch1);

    // Check for pointer & size alignment
    // (rScratch1 = dest | size, so a low bit is clear only if it is
    // clear in both)
    __ movq(rScratch1, dest);
    __ orq(rScratch1, size);

    __ testb(rScratch1, 7);
    __ jcc(Assembler::equal, L_fillQuadwords);

    __ testb(rScratch1, 3);
    __ jcc(Assembler::equal, L_fillDwords);

    __ testb(rScratch1, 1);
    __ jcc(Assembler::notEqual, L_fillBytes);

    // Fill words
    {
      UnsafeMemoryAccessMark umam(this, true, true);

      // At this point, we know the lower bit of size is zero and a
      // multiple of 2
      do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
                               L_exit, _masm);
    }
    __ jmpb(L_exit);

    __ BIND(L_fillQuadwords);

    // Fill QUADWORDs
    {
      UnsafeMemoryAccessMark umam(this, true, true);

      // At this point, we know the lower 3 bits of size are zero and a
      // multiple of 8
      do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
                               L_exit, _masm);
    }
    // The quadword loop falls straight through into the common exit.
    __ BIND(L_exit);

    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_fillDwords);

    // Fill DWORDs
    {
      UnsafeMemoryAccessMark umam(this, true, true);

      // At this point, we know the lower 2 bits of size are zero and a
      // multiple of 4
      do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
                               L_exit, _masm);
    }
    __ jmpb(L_exit);

    __ BIND(L_fillBytes);
    // Set up for tail call to previously generated byte fill routine
    // Parameter order is (ptr, byteVal, size)
    __ xchgq(c_rarg1, c_rarg2);
    __ leave();    // Clear effect of enter()
    __ jump(RuntimeAddress(unsafe_byte_fill));
  }

  // retrieve the registered handler addresses
  address end = __ pc();
  retrieve_unsafe_access_handlers(start, end, extras);
  assert(extras.length() == expected_handler_count,
         "unexpected handler addresses count %d", extras.length());

  // record the stub entry and end plus the extra handler addresses
  // (this stub has a single entry, so no entries array is passed)
  store_archive_data(stub_id, start, end, nullptr, &extras);

  return start;
}
3415 
3416 // Perform range checks on the proposed arraycopy.
3417 // Kills temp, but nothing else.
3418 // Also, clean the sign bits of src_pos and dst_pos.
3419 void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
3420                                            Register src_pos, // source position (c_rarg1)
3421                                            Register dst,     // destination array oo (c_rarg2)
3422                                            Register dst_pos, // destination position (c_rarg3)
3423                                            Register length,
3424                                            Register temp,
3425                                            Label& L_failed) {
3426   BLOCK_COMMENT("arraycopy_range_checks:");
3427 
3428   //  if (src_pos + length > arrayOop(src)->length())  FAIL;
3429   __ movl(temp, length);
3430   __ addl(temp, src_pos);             // src_pos + length
3431   __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
3432   __ jcc(Assembler::above, L_failed);
3433 
3434   //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
3435   __ movl(temp, length);
3436   __ addl(temp, dst_pos);             // dst_pos + length
3437   __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
3438   __ jcc(Assembler::above, L_failed);
3439 
3440   // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
3441   // Move with sign extension can be used since they are positive.
3442   __ movslq(src_pos, src_pos);
3443   __ movslq(dst_pos, dst_pos);
3444 
3445   BLOCK_COMMENT("arraycopy_range_checks done");
3446 }
3447 
3448 
3449 //  Generate generic array copy stubs
3450 //
3451 //  Input:
3452 //    c_rarg0    -  src oop
3453 //    c_rarg1    -  src_pos (32-bits)
3454 //    c_rarg2    -  dst oop
3455 //    c_rarg3    -  dst_pos (32-bits)
3456 // not Win64
3457 //    c_rarg4    -  element count (32-bits)
3458 // Win64
3459 //    rsp+40     -  element count (32-bits)
3460 //
3461 //  Output:
3462 //    rax ==  0  -  success
3463 //    rax == -1^K - failure, where K is partial transfer count
3464 //
3465 address StubGenerator::generate_generic_copy(address byte_copy_entry, address short_copy_entry,
3466                                              address int_copy_entry, address oop_copy_entry,
3467                                              address long_copy_entry, address checkcast_copy_entry) {
3468 
3469   StubId stub_id = StubId::stubgen_generic_arraycopy_id;
3470   int entry_count = StubInfo::entry_count(stub_id);
3471   assert(entry_count == 1, "sanity check");
3472   address start = load_archive_data(stub_id);
3473   if (start != nullptr) {
3474     return start;
3475   }
3476 
3477   Label L_failed, L_failed_0, L_skip_failed_0, L_objArray;
3478   Label L_copy_shorts, L_copy_ints, L_copy_longs;
3479 
3480   // Input registers
3481   const Register src        = c_rarg0;  // source array oop
3482   const Register src_pos    = c_rarg1;  // source position
3483   const Register dst        = c_rarg2;  // destination array oop
3484   const Register dst_pos    = c_rarg3;  // destination position
3485 #ifndef _WIN64
3486   const Register length     = c_rarg4;
3487   const Register rklass_tmp = r9;  // load_klass
3488 #else
3489   const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
3490   const Register rklass_tmp = rdi;  // load_klass
3491 #endif
3492 
3493   StubCodeMark mark(this, stub_id);
3494   __ align(CodeEntryAlignment);
3495   start = __ pc();
3496 
3497   __ enter(); // required for proper stackwalking of RuntimeStub frame
3498 
3499 #ifdef _WIN64
3500   __ push_ppx(rklass_tmp); // rdi is callee-save on Windows
3501 #endif
3502 
3503   // bump this on entry, not on exit:
3504   INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);
3505 
3506   //-----------------------------------------------------------------------
3507   // Assembler stub will be used for this call to arraycopy
3508   // if the following conditions are met:
3509   //
3510   // (1) src and dst must not be null.
3511   // (2) src_pos must not be negative.
3512   // (3) dst_pos must not be negative.
3513   // (4) length  must not be negative.
3514   // (5) src klass and dst klass should be the same and not null.
3515   // (6) src and dst should be arrays.
3516   // (7) src_pos + length must not exceed length of src.
3517   // (8) dst_pos + length must not exceed length of dst.
3518   //
3519 
3520   //  if (src == nullptr) return -1;
3521   __ testptr(src, src);         // src oop
3522   size_t j1off = __ offset();
3523   __ jccb(Assembler::zero, L_failed_0);
3524 
3525   //  if (src_pos < 0) return -1;
3526   __ testl(src_pos, src_pos); // src_pos (32-bits)
3527   __ jccb(Assembler::negative, L_failed_0);
3528 
3529   //  if (dst == nullptr) return -1;
3530   __ testptr(dst, dst);         // dst oop
3531   __ jccb(Assembler::zero, L_failed_0);
3532 
3533   //  if (dst_pos < 0) return -1;
3534   __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3535   size_t j4off = __ offset();
3536   // skip over the failure trampoline
3537   __ jccb(Assembler::positive, L_skip_failed_0);
3538 
3539   // The first four tests are very dense code,
3540   // but not quite dense enough to put four
3541   // jumps in a 16-byte instruction fetch buffer.
3542   // That's good, because some branch predicters
3543   // do not like jumps so close together.
3544   // Make sure of this.
3545   guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3546 
3547   // Short-hop target to L_failed.  Makes for denser prologue code.
3548   __ BIND(L_failed_0);
3549   __ jmp(L_failed);
3550 
3551   // continue here if first 4 checks pass
3552   __ bind(L_skip_failed_0);
3553 
3554   // registers used as temp
3555   const Register r11_length    = r11; // elements count to copy
3556   const Register r10_src_klass = r10; // array klass
3557 
3558   //  if (length < 0) return -1;
3559   __ movl(r11_length, length);        // length (elements count, 32-bits value)
3560   __ testl(r11_length, r11_length);
3561   __ jccb(Assembler::negative, L_failed_0);
3562 
3563   __ load_klass(r10_src_klass, src, rklass_tmp);
3564 #ifdef ASSERT
3565   //  assert(src->klass() != nullptr);
3566   {
3567     BLOCK_COMMENT("assert klasses not null {");
3568     Label L1, L2;
3569     __ testptr(r10_src_klass, r10_src_klass);
3570     __ jcc(Assembler::notZero, L2);   // it is broken if klass is null
3571     __ bind(L1);
3572     __ stop("broken null klass");
3573     __ bind(L2);
3574     __ load_klass(rax, dst, rklass_tmp);
3575     __ cmpq(rax, 0);
3576     __ jcc(Assembler::equal, L1);     // this would be broken also
3577     BLOCK_COMMENT("} assert klasses not null done");
3578   }
3579 #endif
3580 
3581   // Load layout helper (32-bits)
3582   //
3583   //  |array_tag|     | header_size | element_type |     |log2_element_size|
3584   // 32        30    24            16              8     2                 0
3585   //
3586   //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3587   //
3588 
3589   const int lh_offset = in_bytes(Klass::layout_helper_offset());
3590 
3591   // Handle objArrays completely differently...
3592   const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3593   __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3594   __ jcc(Assembler::equal, L_objArray);
3595 
3596   //  if (src->klass() != dst->klass()) return -1;
3597   __ load_klass(rax, dst, rklass_tmp);
3598   __ cmpq(r10_src_klass, rax);
3599   __ jcc(Assembler::notEqual, L_failed);
3600 
3601   // Check for flat inline type array -> return -1
3602   __ test_flat_array_oop(src, rax, L_failed);
3603 
3604   // Check for null-free (non-flat) inline type array -> handle as object array
3605   __ test_null_free_array_oop(src, rax, L_objArray);
3606 
3607   const Register rax_lh = rax;  // layout helper
3608   __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3609 
3610   // Check for flat inline type array -> return -1
3611   __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace);
3612   __ jcc(Assembler::notZero, L_failed);
3613 
3614   //  if (!src->is_Array()) return -1;
3615   __ cmpl(rax_lh, Klass::_lh_neutral_value);
3616   __ jcc(Assembler::greaterEqual, L_failed);
3617 
3618   // At this point, it is known to be a typeArray (array_tag 0x3).
3619 #ifdef ASSERT
3620   {
3621     BLOCK_COMMENT("assert primitive array {");
3622     Label L;
3623     __ movl(rklass_tmp, rax_lh);
3624     __ sarl(rklass_tmp, Klass::_lh_array_tag_shift);
3625     __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value);
3626     __ jcc(Assembler::equal, L);
3627     __ stop("must be a primitive array");
3628     __ bind(L);
3629     BLOCK_COMMENT("} assert primitive array done");
3630   }
3631 #endif
3632 
3633   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3634                          r10, L_failed);
3635 
3636   // TypeArrayKlass
3637   //
3638   // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3639   // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3640   //
3641 
3642   const Register r10_offset = r10;    // array offset
3643   const Register rax_elsize = rax_lh; // element size
3644 
3645   __ movl(r10_offset, rax_lh);
3646   __ shrl(r10_offset, Klass::_lh_header_size_shift);
3647   __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3648   __ addptr(src, r10_offset);           // src array offset
3649   __ addptr(dst, r10_offset);           // dst array offset
3650   BLOCK_COMMENT("choose copy loop based on element size");
3651   __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3652 
3653 #ifdef _WIN64
3654   __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3655 #endif
3656 
3657   // next registers should be set before the jump to corresponding stub
3658   const Register from     = c_rarg0;  // source array address
3659   const Register to       = c_rarg1;  // destination array address
3660   const Register count    = c_rarg2;  // elements count
3661 
3662   // 'from', 'to', 'count' registers should be set in such order
3663   // since they are the same as 'src', 'src_pos', 'dst'.
3664 
3665   __ cmpl(rax_elsize, 0);
3666   __ jccb(Assembler::notEqual, L_copy_shorts);
3667   __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3668   __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3669   __ movl2ptr(count, r11_length); // length
3670   __ jump(RuntimeAddress(byte_copy_entry));
3671 
3672 __ BIND(L_copy_shorts);
3673   __ cmpl(rax_elsize, LogBytesPerShort);
3674   __ jccb(Assembler::notEqual, L_copy_ints);
3675   __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3676   __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3677   __ movl2ptr(count, r11_length); // length
3678   __ jump(RuntimeAddress(short_copy_entry));
3679 
3680 __ BIND(L_copy_ints);
3681   __ cmpl(rax_elsize, LogBytesPerInt);
3682   __ jccb(Assembler::notEqual, L_copy_longs);
3683   __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3684   __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3685   __ movl2ptr(count, r11_length); // length
3686   __ jump(RuntimeAddress(int_copy_entry));
3687 
3688 __ BIND(L_copy_longs);
3689 #ifdef ASSERT
3690   {
3691     BLOCK_COMMENT("assert long copy {");
3692     Label L;
3693     __ cmpl(rax_elsize, LogBytesPerLong);
3694     __ jcc(Assembler::equal, L);
3695     __ stop("must be long copy, but elsize is wrong");
3696     __ bind(L);
3697     BLOCK_COMMENT("} assert long copy done");
3698   }
3699 #endif
3700   __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3701   __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3702   __ movl2ptr(count, r11_length); // length
3703   __ jump(RuntimeAddress(long_copy_entry));
3704 
3705   // ObjArrayKlass
3706 __ BIND(L_objArray);
3707   // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3708 
3709   Label L_plain_copy, L_checkcast_copy;
3710   //  test array classes for subtyping
3711   __ load_klass(rax, dst, rklass_tmp);
3712   __ cmpq(r10_src_klass, rax); // usual case is exact equality
3713   __ jcc(Assembler::notEqual, L_checkcast_copy);
3714 
3715   // Identically typed arrays can be copied without element-wise checks.
3716   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3717                          r10, L_failed);
3718 
3719   __ lea(from, Address(src, src_pos, TIMES_OOP,
3720                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3721   __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3722                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3723   __ movl2ptr(count, r11_length); // length
3724 __ BIND(L_plain_copy);
3725 #ifdef _WIN64
3726   __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3727 #endif
3728   __ jump(RuntimeAddress(oop_copy_entry));
3729 
3730 __ BIND(L_checkcast_copy);
3731   // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3732   {
3733     // Before looking at dst.length, make sure dst is also an objArray.
3734     // This check also fails for flat arrays which are not supported.
3735     __ cmpl(Address(rax, lh_offset), objArray_lh);
3736     __ jcc(Assembler::notEqual, L_failed);
3737 
3738 #ifdef ASSERT
3739     {
3740       BLOCK_COMMENT("assert not null-free array {");
3741       Label L;
3742       __ test_non_null_free_array_oop(dst, rklass_tmp, L);
3743       __ stop("unexpected null-free array");
3744       __ bind(L);
3745       BLOCK_COMMENT("} assert not null-free array");
3746     }
3747 #endif
3748 
3749     // It is safe to examine both src.length and dst.length.
3750     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3751                            rax, L_failed);
3752 
3753     const Register r11_dst_klass = r11;
3754     __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3755 
3756     // Marshal the base address arguments now, freeing registers.
3757     __ lea(from, Address(src, src_pos, TIMES_OOP,
3758                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3759     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3760                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3761     __ movl(count, length);           // length (reloaded)
3762     Register sco_temp = c_rarg3;      // this register is free now
3763     assert_different_registers(from, to, count, sco_temp,
3764                                r11_dst_klass, r10_src_klass);
3765     assert_clean_int(count, sco_temp);
3766 
3767     // Generate the type check.
3768     const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3769     __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3770     assert_clean_int(sco_temp, rax);
3771     generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3772 
3773     // Fetch destination element klass from the ObjArrayKlass header.
3774     int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3775     __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3776     __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3777     assert_clean_int(sco_temp, rax);
3778 
3779 #ifdef _WIN64
3780     __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3781 #endif
3782 
3783     // the checkcast_copy loop needs two extra arguments:
3784     assert(c_rarg3 == sco_temp, "#3 already in place");
3785     // Set up arguments for checkcast_copy_entry.
3786     setup_arg_regs_using_thread(4);
3787     __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3788     __ jump(RuntimeAddress(checkcast_copy_entry));
3789   }
3790 
3791 __ BIND(L_failed);
3792 #ifdef _WIN64
3793   __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3794 #endif
3795   __ xorptr(rax, rax);
3796   __ notptr(rax); // return -1
3797   __ leave();   // required for proper stackwalking of RuntimeStub frame
3798   __ ret(0);
3799 
3800   // record the stub entry and end
3801   store_archive_data(stub_id, start, __ pc());
3802 
3803   return start;
3804 }
3805 
3806 #undef __