/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/objArrayKlass.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "stubGenerator_x86_64.hpp"
#ifdef COMPILER2
#include "opto/c2_globals.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmci_globals.hpp"
#endif

#define __ _masm->

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
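// With compressed oops each heap reference is 4 bytes wide, otherwise 8,
// so the index scale used for oop arrays follows UseCompressedOops.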

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef PRODUCT
#define INC_COUNTER_NP(counter, rscratch) ((void)0)
#else
#define INC_COUNTER_NP(counter, rscratch) \
BLOCK_COMMENT("inc_counter " #counter); \
inc_counter_np(_masm, counter, rscratch);

static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  __ incrementl(ExternalAddress((address)&counter), rscratch);
}

#if COMPILER2_OR_JVMCI
static uint& get_profile_ctr(int shift) {
  if (shift == 0) {
    return SharedRuntime::_jbyte_array_copy_ctr;
  } else if (shift == 1) {
    return SharedRuntime::_jshort_array_copy_ctr;
  } else if (shift == 2) {
    return SharedRuntime::_jint_array_copy_ctr;
  } else {
    assert(shift == 3, "");
    return SharedRuntime::_jlong_array_copy_ctr;
  }
}
#endif // COMPILER2_OR_JVMCI
#endif // !PRODUCT

void StubGenerator::generate_arraycopy_stubs() {
  // Some copy stubs publish a normal entry and then a second 'fallback'
  // entry immediately following their stack push. This can be used
  // as a post-push branch target for compatible stubs when they
  // identify a special case that can be handled by the fallback
  // stub, e.g. a disjoint copy stub may be used as a special-case
  // fallback for its compatible conjoint copy stub.
  //
  // A no-push entry is always returned in the following local and
  // then published by assigning it to the appropriate entry field in
  // class StubRoutines. The entry value is then passed to the
  // generator for the compatible stub. That means the entry must be
  // listed when saving to/restoring from the AOT cache, ensuring
  // that the inter-stub jumps are noted at AOT-cache save and
  // relocated at AOT-cache load.
  address nopush_entry;

  StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(&nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
  StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jbyte_arraycopy_nopush    = nopush_entry;

  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jshort_arraycopy_nopush   = nopush_entry;

  StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jint_arraycopy_nopush     = nopush_entry;

  StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jlong_disjoint_arraycopy_nopush  = nopush_entry;
  StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jlong_arraycopy_nopush    = nopush_entry;

  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush    = nopush_entry;
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush  = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  } else {
    StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush    = nopush_entry;
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush  = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  }

  StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
  // checkcast nopush entry is needed by generic copy
  StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
  // note that we don't need a returned nopush entry because the
  // generic copy does not cater for uninit arrays.
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);

  StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                            StubRoutines::_jshort_arraycopy_nopush,
                                                            StubRoutines::_jint_arraycopy_nopush,
                                                            StubRoutines::_jlong_arraycopy_nopush);
  StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                             StubRoutines::_jshort_arraycopy_nopush,
                                                             StubRoutines::_jint_arraycopy_nopush,
                                                             StubRoutines::_oop_arraycopy_nopush,
                                                             StubRoutines::_jlong_arraycopy_nopush,
                                                             StubRoutines::_checkcast_arraycopy_nopush);

  StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
  StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
  StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
  StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
  StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
  StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);

  StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);

  // We don't generate specialized code for HeapWord-aligned source
  // arrays, so just use the code we've already generated
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

  StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

  StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
  StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

  StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
  StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
}


// Verify that a register contains a clean 32-bit positive value
// (high 32 bits are 0) so it can be used in 64-bit shifts.
//
//  Input:
//    Rint  -  32-bit value
//    Rtmp  -  scratch
//
void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
  Label L;
  assert_different_registers(Rtmp, Rint);
  __ movslq(Rtmp, Rint);
  __ cmpq(Rtmp, Rint);
  __ jcc(Assembler::equal, L);
  __ stop("high 32-bits of int value are not 0");
  __ bind(L);
#endif
}


//  Generate overlap test for array copy stubs
//
//  Input:
//     c_rarg0 - from
//     c_rarg1 - to
//     c_rarg2 - element count
//
//  Output:
//     rax   - &from[element count], the first address after the source data
//
void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
  const Register from     = c_rarg0;
  const Register to       = c_rarg1;
  const Register count    = c_rarg2;
  const Register end_from = rax;

  __ cmpptr(to, from);
  __ lea(end_from, Address(from, count, sf, 0));
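  // In effect this tests the disjointness predicate with unsigned
  // compares:
  //   no overlap iff to <= from || to >= from + count * elem_size
  // i.e. an ascending (forward) copy is only unsafe when 'to' starts
  // strictly inside the source range.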
  if (NOLp == nullptr) {
    RuntimeAddress no_overlap(no_overlap_target);
    __ jump_cc(Assembler::belowEqual, no_overlap);
    __ cmpptr(to, end_from);
    __ jump_cc(Assembler::aboveEqual, no_overlap);
  } else {
    __ jcc(Assembler::belowEqual, (*NOLp));
    __ cmpptr(to, end_from);
    __ jcc(Assembler::aboveEqual, (*NOLp));
  }
}


// Copy big chunks forward
//
// Inputs:
//   end_from     - source array end address
//   end_to       - destination array end address
//   qword_count  - 64-bit element count, negative
//   tmp1         - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes - exit label
//
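// The loop is driven by the negative qword_count, which is incremented
// towards zero: each iteration indexes relative to the array end
// pointers via Address(end_from, qword_count, times_8, disp), so no
// separate pointer increments are needed.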
void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
                                       Register qword_count, Register tmp1,
                                       Register tmp2, Label& L_copy_bytes,
                                       Label& L_copy_8_bytes, DecoratorSet decorators,
                                       BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);

      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -40),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -40), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 8);
    __ jcc(Assembler::lessEqual, L_loop);
    __ subptr(qword_count, 4);  // sub(8) and add(4)
    __ jcc(Assembler::greater, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ addptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32 bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::lessEqual, L_loop);
  }
  __ subptr(qword_count, 4);
  __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
}


// Copy big chunks backward
//
// Inputs:
//   from         - source array address
//   dest         - destination array address
//   qword_count  - 64-bit element count
//   tmp1         - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes - exit label
//
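// Unlike the forward copier this variant indexes from the array base
// addresses and counts qword_count down towards zero, so the highest
// addressed qwords are copied first, as a backward copy requires.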
void StubGenerator::copy_bytes_backward(Register from, Register dest,
                                        Register qword_count, Register tmp1,
                                        Register tmp2, Label& L_copy_bytes,
                                        Label& L_copy_8_bytes, DecoratorSet decorators,
                                        BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 48),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 48), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_loop);

    __ addptr(qword_count, 4);  // add(8) and sub(4)
    __ jcc(Assembler::less, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ subptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32 bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::greaterEqual, L_loop);
  }
  __ addptr(qword_count, 4);
  __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
}

#if COMPILER2_OR_JVMCI

// Note: The following rules apply to the AVX3-optimized arraycopy stubs:
// - If the target supports AVX3 features (BW+VL+F) then the implementation uses 32 byte vectors (YMMs)
//   for both special cases (various small block sizes) and the aligned copy loop. This is the
//   default configuration.
// - If the copy length is above CopyAVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
//   for the main copy loop (and subsequent tail) since the bulk of the cycles will be consumed in it.
// - If the user forces MaxVectorSize=32 then above 4096 bytes REP MOVS is seen to perform
//   better for disjoint copies. For conjoint/backward copies a vector-based
//   copy performs better.
// - If the user sets CopyAVX3Threshold=0, then special cases for small block sizes operate over
//   64 byte vector registers (ZMMs).
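// - In the code below these rules surface as the per-type threshold[]
//   tables (4096 bytes worth of elements for each type), the
//   CopyAVX3Threshold != 0 guards and the use64byteVector flag.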

// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
//
// Side Effects:
//   *entry is set to the no-overlap entry point, which is used by
//   generate_conjoint_copy_avx3_masked().
//
address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  int shift;
  bool is_oop;
  bool dest_uninitialized;

  switch (stub_id) {
  case StubId::stubgen_jbyte_disjoint_arraycopy_id:
    shift = 0;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jshort_disjoint_arraycopy_id:
    shift = 1;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jint_disjoint_arraycopy_id:
    shift = 2;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jlong_disjoint_arraycopy_id:
    shift = 3;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }
  GrowableArray<address> entries;
  GrowableArray<address> extras;
  bool add_handlers = !is_oop && !aligned;
  bool add_relocs = UseZGC && is_oop;
  bool add_extras = add_handlers || add_relocs;
  // The stub employs one unsafe handler region by default but has two
  // when MaxVectorSize == 64, so we may expect 0, 3 or 6 extras.
  int handlers_count = (MaxVectorSize == 64 ? 2 : 1);
  int expected_entry_count = (entry != nullptr ? 2 : 1);
  int expected_extra_count = (add_handlers ? handlers_count : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/1/2 x UMAM {start,end,handler}
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == expected_entry_count, "sanity check");
  GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
  GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
  address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
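  // A non-null start means the stub was loaded from the AOT cache;
  // re-publish its extra entry and handler/reloc addresses instead of
  // generating the code again.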
  if (start != nullptr) {
    assert(entries.length() == expected_entry_count - 1,
           "unexpected entry count %d", entries.length());
    assert(!add_handlers || extras.length() == expected_extra_count,
           "unexpected handler addresses count %d", extras.length());
    if (entry != nullptr) {
      *entry = entries.at(0);
    }
    if (add_handlers) {
      // restore 1/2 x UMAM {start,end,handler} addresses from extras
      register_unsafe_access_handlers(extras, 0, handlers_count);
    }
#if INCLUDE_ZGC
    // register addresses at which ZGC does colour patching
    if (add_relocs) {
      register_reloc_addresses(extras, 0, extras.length());
    }
#endif // INCLUDE_ZGC
    return start;
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
  const int large_threshold = 2621440; // 2.5 MB
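  // Copies of at least large_threshold bytes skip the PRE-MAIN-POST
  // loops and take the non-temporal 256 byte block path generated by
  // arraycopy_avx3_large below (only when MaxVectorSize == 64).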
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_copy_large, L_finish;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register temp1       = r8;
  const Register temp2       = r11;
  const Register temp3       = rax;
  const Register temp4       = rcx;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    entries.append(*entry);
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // Type(shift)           byte(0), short(1), int(2),   long(3)
    int loop_size[]        = { 192,     96,       48,      24};
    int threshold[]        = { 4096,    2048,     1024,    512};

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, add_handlers, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds the remaining count and temp4 holds the running count
    // used to compute the next address offset for the to/from addresses (temp4 * scale).
    __ mov64(temp4, 0);
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                 temp4, temp3, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if (MaxVectorSize == 64) {
      __ movq(temp2, temp1);
      __ shlq(temp2, shift);
      __ cmpq(temp2, large_threshold);
      __ jcc(Assembler::greaterEqual, L_copy_large);
    }
    if (CopyAVX3Threshold != 0) {
      __ cmpq(count, threshold[shift]);
      if (MaxVectorSize == 64) {
        // Copy using 64 byte vectors.
        __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
      } else {
        assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offers a faster copy path.
        __ jcc(Assembler::greaterEqual, L_repmovs);
      }
    }

    if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      __ negptr(temp2);
      __ addq(temp2, 32);
      if (shift) {
        __ shrq(temp2, shift);
      }
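      // temp2 now holds the element count needed to bring the
      // destination up to the next 32 byte boundary.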
      __ movq(temp3, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
         copy64_avx(to, from, temp4, xmm1, false, shift, 0);
         copy64_avx(to, from, temp4, xmm1, false, shift, 64);
         copy64_avx(to, from, temp4, xmm1, false, shift, 128);
         __ addptr(temp4, loop_size[shift]);
         __ subq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop);

      __ addq(temp1, loop_size[shift]);

      // Tail loop.
      __ jmp(L_tail);

      __ BIND(L_repmovs);
        __ movq(temp2, temp1);
        // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics.
        __ movq(temp3, to);
        __ movq(to,  from);
        __ movq(from, temp3);
        // Save to/from for restoration post rep_mov.
        __ movq(temp1, to);
        __ movq(temp3, from);
        if (shift < 3) {
          __ shrq(temp2, 3 - shift);   // quad word count
        }
        __ movq(temp4, temp2);         // move quad word count into temp4 (RCX).
        __ rep_mov();
        __ shlq(temp2, 3);             // convert quad words into byte count.
        if (shift) {
          __ shrq(temp2, shift);       // type-specific count.
        }
        // Restore original addresses in to/from.
        __ movq(to, temp3);
        __ movq(from, temp1);
        __ movq(temp4, temp2);
        __ movq(temp1, count);
        __ subq(temp1, temp2);         // trailing part (less than a quad word).
        __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      __ negptr(temp2);
      __ addq(temp2, 64);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
         copy64_avx(to, from, temp4, xmm1, false, shift, 0, true);
         copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
         copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
         __ addptr(temp4, loop_size[shift]);
         __ subq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop_64bytes);

      __ addq(temp1, loop_size[shift]);
      // Zero length check.
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }

  __ BIND(L_finish);
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 must contain specific values
  // used during the arraycopy epilogue, so re-initialize r11.
  if (is_oop) {
    __ movq(r11, shift == 3 ? count : to);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  if (MaxVectorSize == 64) {
    __ BIND(L_copy_large);
      UnsafeMemoryAccessMark umam(this, add_handlers, false, ucme_exit_pc);
      arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
    __ jmp(L_finish);
  }
  // retrieve the registered handler addresses
  address end = __ pc();
  if (add_handlers) {
    retrieve_unsafe_access_handlers(start, end, extras);
  }
  assert(extras.length() == expected_extra_count,
         "unexpected handler addresses count %d", extras.length());
#if INCLUDE_ZGC
  // retrieve addresses at which ZGC does colour patching
  if (add_relocs) {
    retrieve_reloc_addresses(start, end, extras);
  }
#endif // INCLUDE_ZGC

  // record the stub entry and end plus the no_push entry and any
  // extra handler addresses
  store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);

  return start;
}

void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                                         Register temp3, Register temp4, Register count,
                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                                         XMMRegister xmm4, int shift) {

  // Type(shift)           byte(0), short(1), int(2),   long(3)
  int loop_size[]        = { 256,     128,       64,      32};
  int threshold[]        = { 4096,    2048,     1024,    512};

  Label L_main_loop_large;
  Label L_tail_large;
  Label L_exit_large;
  Label L_entry_large;
  Label L_main_pre_loop_large;
  Label L_pre_main_post_large;

  assert(MaxVectorSize == 64, "vector length != 64");
  __ BIND(L_entry_large);

  __ BIND(L_pre_main_post_large);
  // Partial copy to make dst address 64 byte aligned.
  __ movq(temp2, to);
  __ andq(temp2, 63);
  __ jcc(Assembler::equal, L_main_pre_loop_large);

  __ negptr(temp2);
  __ addq(temp2, 64);
  if (shift) {
    __ shrq(temp2, shift);
  }
  __ movq(temp3, temp2);
  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
  __ movq(temp4, temp2);
  __ movq(temp1, count);
  __ subq(temp1, temp2);

  __ cmpq(temp1, loop_size[shift]);
  __ jcc(Assembler::less, L_tail_large);

  __ BIND(L_main_pre_loop_large);
  __ subq(temp1, loop_size[shift]);

  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
  __ align32();
  __ BIND(L_main_loop_large);
  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
  __ addptr(temp4, loop_size[shift]);
  __ subq(temp1, loop_size[shift]);
  __ jcc(Assembler::greater, L_main_loop_large);
  // fence needed because copy256_avx3 uses non-temporal stores
  __ sfence();

  __ addq(temp1, loop_size[shift]);
  // Zero length check.
  __ jcc(Assembler::lessEqual, L_exit_large);
  __ BIND(L_tail_large);
  // Tail handling using 64 byte [masked] vector copy operations.
  __ cmpq(temp1, 0);
  __ jcc(Assembler::lessEqual, L_exit_large);
  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, L_exit_large);
  __ BIND(L_exit_large);
}

// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
//
address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, address* entry, address nooverlap_target) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  int shift;
  bool is_oop;
  bool dest_uninitialized;

  switch (stub_id) {
  case StubId::stubgen_jbyte_arraycopy_id:
    shift = 0;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jshort_arraycopy_id:
    shift = 1;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jint_arraycopy_id:
    shift = 2;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jlong_arraycopy_id:
    shift = 3;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_uninit_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }
  GrowableArray<address> entries;
  GrowableArray<address> extras;
  bool add_handlers = !is_oop && !aligned;
  bool add_relocs = UseZGC && is_oop;
  bool add_extras = add_handlers || add_relocs;
  int expected_entry_count = (entry != nullptr ? 2 : 1);
  int expected_handler_count = (add_handlers ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/1 x UMAM {start,end,handler}
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == expected_entry_count, "sanity check");
  GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
  GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
  address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
  if (start != nullptr) {
    assert(entries.length() == expected_entry_count - 1,
           "unexpected entry count %d", entries.length());
    assert(!add_handlers || extras.length() == expected_handler_count,
           "unexpected handler addresses count %d", extras.length());
    if (entry != nullptr) {
      *entry = entries.at(0);
    }
    if (add_handlers) {
      // restore 1 x UMAM {start,end,handler} addresses from extras
      register_unsafe_access_handlers(extras, 0, 1);
    }
#if INCLUDE_ZGC
    if (add_relocs) {
      // register addresses at which ZGC does colour patching
      register_reloc_addresses(extras, 0, extras.length());
    }
#endif // INCLUDE_ZGC
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);

  Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register temp1       = r8;
  const Register temp2       = rcx;
  const Register temp3       = r11;
  const Register temp4       = rax;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    entries.append(*entry);
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
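  // If the regions do not overlap (or overlap in a way that is still
  // safe for an ascending copy) this branches to nooverlap_target,
  // the no-push entry of the matching disjoint stub.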

  BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
  {
    // Type(shift)       byte(0), short(1), int(2),   long(3)
    int loop_size[]   = { 192,     96,       48,      24};
    int threshold[]   = { 4096,    2048,     1024,    512};

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, add_handlers, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count.
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    __ mov64(temp2, 0);
    __ movq(temp3, temp1);
    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                          temp4, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if ((MaxVectorSize > 32) && (CopyAVX3Threshold != 0)) {
      __ cmpq(temp1, threshold[shift]);
      __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
    }

    if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
         copy64_avx(to, from, temp1, xmm1, true, shift, -64);
         copy64_avx(to, from, temp1, xmm1, true, shift, -128);
         copy64_avx(to, from, temp1, xmm1, true, shift, -192);
         __ subptr(temp1, loop_size[shift]);
         __ cmpq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop);

      // Tail loop.
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0, true);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
         copy64_avx(to, from, temp1, xmm1, true, shift, -64, true);
         copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
         copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
         __ subq(temp1, loop_size[shift]);
         __ cmpq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop_64bytes);

      // Zero length check.
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      __ mov64(temp2, 0);
      __ movq(temp3, temp1);
      arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                            temp4, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 must contain specific values
  // used during the arraycopy epilogue, so re-initialize r11.
  if (is_oop) {
    __ movq(r11, count);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  // retrieve the registered handler addresses
  address end = __ pc();
  if (add_handlers) {
    retrieve_unsafe_access_handlers(start, end, extras);
  }
  assert(extras.length() == expected_handler_count,
         "unexpected handler addresses count %d", extras.length());
#if INCLUDE_ZGC
  // retrieve addresses at which ZGC does colour patching
  if (add_relocs) {
    retrieve_reloc_addresses(start, end, extras);
  }
#endif // INCLUDE_ZGC
  // record the stub entry and end plus the no_push entry and any
  // extra handler addresses
  store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);

  return start;
}

void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                 Register to, Register count, int shift,
                                                 Register index, Register temp,
                                                 bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  int size_mat[][6] = {
  /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
  /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
  /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
  /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
  };
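  // Each column is the element count corresponding to 32/64/96/128/160/192
  // bytes for the row's element type.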

  // Case A) Special case for length less than or equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  __ jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  __ jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  __ jmp(L_exit);
}
1247 
1248 void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
1249                                                      Register to, Register count, int shift, Register index,
1250                                                      Register temp, Label& L_exit) {
1251   Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
1252 
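       // Element counts equivalent to the 64/128/192/256-byte length
       // thresholds for element size 1 << shift, i.e.
       // size_mat[shift][i] == (64 * (i + 1)) >> shift.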
1253   int size_mat[][4] = {
1254   /* T_BYTE */ {64, 128, 192, 256},
1255   /* T_SHORT*/ {32, 64 , 96 , 128},
1256   /* T_INT  */ {16, 32 , 48 ,  64},
1257   /* T_LONG */ { 8, 16 , 24 ,  32}
1258   };
1259 
1260   assert(MaxVectorSize == 64, "vector length != 64");
1261   // Case A) Special case for length less than or equal to 64 bytes.
1262   __ BIND(L_entry_64);
1263   __ cmpq(count, size_mat[shift][0]);
1264   __ jccb(Assembler::greater, L_entry_128);
1265   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
1266   __ jmp(L_exit);
1267 
1268   // Case B) Special case for length less than or equal to 128 bytes.
1269   __ BIND(L_entry_128);
1270   __ cmpq(count, size_mat[shift][1]);
1271   __ jccb(Assembler::greater, L_entry_192);
1272   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1273   __ subq(count, 64 >> shift);
1274   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
1275   __ jmp(L_exit);
1276 
1277   // Case C) Special case for length less than or equal to 192 bytes.
1278   __ BIND(L_entry_192);
1279   __ cmpq(count, size_mat[shift][2]);
1280   __ jcc(Assembler::greater, L_entry_256);
1281   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1282   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1283   __ subq(count, 128 >> shift);
1284   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
1285   __ jmp(L_exit);
1286 
1287   // Case D) Special case for length less than or equal to 256 bytes.
1288   __ BIND(L_entry_256);
1289   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1290   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1291   copy64_avx(to, from, index, xmm, false, shift, 128, true);
1292   __ subq(count, 192 >> shift);
1293   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
1294   __ jmp(L_exit);
1295 }
1296 
1297 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
1298                                                            Register to, Register start_index, Register end_index,
1299                                                            Register count, int shift, Register temp,
1300                                                            bool use64byteVector, Label& L_entry, Label& L_exit) {
1301   Label L_entry_64, L_entry_96, L_entry_128;
1302   Label L_entry_160, L_entry_192;
1303   bool avx3 = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
1304 
1305   int size_mat[][6] = {
1306   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
1307   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
1308   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
1309   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
1310   };
1311 
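       // In the overlapping (conjoint) cases below, full 32/64-byte blocks
       // are copied from the high end downwards (via end_index with negative
       // offsets) before the low-end remainder is copied with a masked move,
       // so a backward overlap never overwrites source bytes before they
       // have been read.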
1312   // Case A) Special case for length less than or equal to 32 bytes.
1313   __ cmpq(count, size_mat[shift][0]);
1314   __ jccb(Assembler::greater, L_entry_64);
1315   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1316   __ jmp(L_exit);
1317 
1318   // Case B) Special case for length less than or equal to 64 bytes.
1319   __ BIND(L_entry_64);
1320   __ cmpq(count, size_mat[shift][1]);
1321   __ jccb(Assembler::greater, L_entry_96);
1322   if (avx3) {
1323      copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
1324   } else {
1325      copy32_avx(to, from, end_index, xmm, shift, -32);
1326      __ subq(count, 32 >> shift);
1327      copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1328   }
1329   __ jmp(L_exit);
1330 
1331   // Case C) Special case for length less than or equal to 96 bytes.
1332   __ BIND(L_entry_96);
1333   __ cmpq(count, size_mat[shift][2]);
1334   __ jccb(Assembler::greater, L_entry_128);
1335   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1336   __ subq(count, 64 >> shift);
1337   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1338   __ jmp(L_exit);
1339 
1340   // Case D) Special case for length less than or equal to 128 bytes.
1341   __ BIND(L_entry_128);
1342   __ cmpq(count, size_mat[shift][3]);
1343   __ jccb(Assembler::greater, L_entry_160);
1344   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1345   copy32_avx(to, from, end_index, xmm, shift, -96);
1346   __ subq(count, 96 >> shift);
1347   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1348   __ jmp(L_exit);
1349 
1350   // Case E) Special case for length less than or equal to 160 bytes.
1351   __ BIND(L_entry_160);
1352   __ cmpq(count, size_mat[shift][4]);
1353   __ jccb(Assembler::greater, L_entry_192);
1354   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1355   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1356   __ subq(count, 128 >> shift);
1357   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1358   __ jmp(L_exit);
1359 
1360   // Case F) Special case for length less than or equal to 192 bytes.
1361   __ BIND(L_entry_192);
1362   __ cmpq(count, size_mat[shift][5]);
1363   __ jcc(Assembler::greater, L_entry);
1364   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1365   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1366   copy32_avx(to, from, end_index, xmm, shift, -160);
1367   __ subq(count, 160 >> shift);
1368   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1369   __ jmp(L_exit);
1370 }
1371 
1372 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1373                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1374                                 int shift, int offset) {
1375   if (MaxVectorSize == 64) {
1376     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
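         // Prefetch source data 512 (0x200) and 1024 (0x400) bytes ahead of
         // the current block to hide memory latency on large copies.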
1377     __ prefetcht0(Address(src, index, scale, offset + 0x200));
1378     __ prefetcht0(Address(src, index, scale, offset + 0x240));
1379     __ prefetcht0(Address(src, index, scale, offset + 0x280));
1380     __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1381 
1382     __ prefetcht0(Address(src, index, scale, offset + 0x400));
1383     __ prefetcht0(Address(src, index, scale, offset + 0x440));
1384     __ prefetcht0(Address(src, index, scale, offset + 0x480));
1385     __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1386 
1387     __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1388     __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1389     __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1390     __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1391 
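         // Use non-temporal stores for the destination: the output of a
         // large copy is unlikely to be read again soon, so bypassing the
         // cache avoids displacing useful lines.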
1392     __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1393     __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1394     __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1395     __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1396   }
1397 }
1398 
1399 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1400                                        KRegister mask, Register length, Register index,
1401                                        Register temp, int shift, int offset,
1402                                        bool use64byteVector) {
1403   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1404   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1405   if (!use64byteVector) {
1406     copy32_avx(dst, src, index, xmm, shift, offset);
1407     __ subptr(length, 32 >> shift);
1408     copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1409   } else {
1410     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1411     assert(MaxVectorSize == 64, "vector length != 64");
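         // Build a k-mask with the low 'length' bits set: bzhi zeroes every
         // bit of -1 at index >= length, leaving (1 << length) - 1.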
1412     __ mov64(temp, -1L);
1413     __ bzhiq(temp, temp, length);
1414     __ kmovql(mask, temp);
1415     __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1416     __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1417   }
1418 }
1419 
1420 
1421 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1422                                        KRegister mask, Register length, Register index,
1423                                        Register temp, int shift, int offset) {
1424   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1425   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1426   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
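       // mask = (1 << length) - 1: select only the 'length' remaining elements.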
1427   __ mov64(temp, -1L);
1428   __ bzhiq(temp, temp, length);
1429   __ kmovql(mask, temp);
1430   __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1431   __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1432 }
1433 
1434 
1435 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1436                                 int shift, int offset) {
1437   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1438   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1439   __ vmovdqu(xmm, Address(src, index, scale, offset));
1440   __ vmovdqu(Address(dst, index, scale, offset), xmm);
1441 }
1442 
1443 
1444 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1445                                 bool conjoint, int shift, int offset, bool use64byteVector) {
1446   assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1447   if (!use64byteVector) {
1448     if (conjoint) {
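           // Copy the high 32-byte half first so that a backward overlapping
           // copy reads it before the low-half store can clobber it.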
1449       copy32_avx(dst, src, index, xmm, shift, offset+32);
1450       copy32_avx(dst, src, index, xmm, shift, offset);
1451     } else {
1452       copy32_avx(dst, src, index, xmm, shift, offset);
1453       copy32_avx(dst, src, index, xmm, shift, offset+32);
1454     }
1455   } else {
1456     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1457     __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1458     __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1459   }
1460 }
1461 
1462 #endif // COMPILER2_OR_JVMCI
1463 
1464 
1465 // Arguments:
1466 //   entry     - location for return of (post-push) entry
1467 //
1468 // Inputs:
1469 //   c_rarg0   - source array address
1470 //   c_rarg1   - destination array address
1471 //   c_rarg2   - element count, treated as ssize_t, can be zero
1472 //
1473 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1474 // we let the hardware handle it.  The one to eight bytes within words,
1475 // dwords or qwords that span cache line boundaries will still be loaded
1476 // and stored atomically.
1477 //
1478 // Side Effects:
1479 //   entry is set to the no-overlap entry point
1480 //   used by generate_conjoint_byte_copy().
1481 //
1482 address StubGenerator::generate_disjoint_byte_copy(address* entry) {
1483   StubId stub_id = StubId::stubgen_jbyte_disjoint_arraycopy_id;
1484   // aligned is always false -- x86_64 always uses the unaligned code
1485   const bool aligned = false;
1486 #if COMPILER2_OR_JVMCI
1487   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1488     return generate_disjoint_copy_avx3_masked(stub_id, entry);
1489   }
1490 #endif
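       // Reinstall an archived copy of this stub if one exists:
       // load_archive_data returns nullptr when there is no archived code,
       // in which case the stub is generated below and then recorded via
       // store_archive_data.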
1491   GrowableArray<address> entries;
1492   GrowableArray<address> extras;
1493   int expected_entry_count = (entry != nullptr ? 2 : 1);
1494   int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1495   int entry_count = StubInfo::entry_count(stub_id);
1496   assert(entry_count == expected_entry_count, "sanity check");
1497   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1498   address start = load_archive_data(stub_id, entries_ptr, &extras);
1499   if (start != nullptr) {
1500     assert(entries.length() == expected_entry_count - 1,
1501            "unexpected entry count %d", entries.length());
1502     assert(extras.length() == expected_handler_count,
1503            "unexpected handler addresses count %d", extras.length());
1504     if (entry != nullptr) {
1505       *entry = entries.at(0);
1506     }
1507     // restore 2 UMAM {start,end,handler} addresses from extras
1508     register_unsafe_access_handlers(extras, 0, 2);
1509     return start;
1510   }
1511   __ align(CodeEntryAlignment);
1512   StubCodeMark mark(this, stub_id);
1513   start = __ pc();
1514   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1515 
1516   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1517   Label L_copy_byte, L_exit;
1518   const Register from        = rdi;  // source array address
1519   const Register to          = rsi;  // destination array address
1520   const Register count       = rdx;  // elements count
1521   const Register byte_count  = rcx;
1522   const Register qword_count = count;
1523   const Register end_from    = from; // source array end address
1524   const Register end_to      = to;   // destination array end address
1525   // End pointers are inclusive, and if count is not zero they point
1526   // to the last unit copied:  end_to[0] := end_from[0]
1527 
1528   __ enter(); // required for proper stackwalking of RuntimeStub frame
1529   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1530 
1531   if (entry != nullptr) {
1532     *entry = __ pc();
1533     entries.append(*entry);
1534     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1535     BLOCK_COMMENT("Entry:");
1536   }
1537 
1538   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1539                     // r9 and r10 may be used to save non-volatile registers
1540 
1541   {
1542     // UnsafeMemoryAccess page error: continue after unsafe access
1543     UnsafeMemoryAccessMark umam(this, !aligned, true);
1544     // 'from', 'to' and 'count' are now valid
1545     __ movptr(byte_count, count);
1546     __ shrptr(count, 3); // count => qword_count
1547 
1548     // Copy from low to high addresses.  Use 'to' as scratch.
1549     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1550     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1551     __ negptr(qword_count); // make the count negative
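         // The qword loop runs this negative index up towards zero, so
         // indexing off end_from/end_to walks forwards through the arrays.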
1552     __ jmp(L_copy_bytes);
1553 
1554     // Copy trailing qwords
1555   __ BIND(L_copy_8_bytes);
1556     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1557     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1558     __ increment(qword_count);
1559     __ jcc(Assembler::notZero, L_copy_8_bytes);
1560 
1561     // Check for and copy trailing dword
1562   __ BIND(L_copy_4_bytes);
1563     __ testl(byte_count, 4);
1564     __ jccb(Assembler::zero, L_copy_2_bytes);
1565     __ movl(rax, Address(end_from, 8));
1566     __ movl(Address(end_to, 8), rax);
1567 
1568     __ addptr(end_from, 4);
1569     __ addptr(end_to, 4);
1570 
1571     // Check for and copy trailing word
1572   __ BIND(L_copy_2_bytes);
1573     __ testl(byte_count, 2);
1574     __ jccb(Assembler::zero, L_copy_byte);
1575     __ movw(rax, Address(end_from, 8));
1576     __ movw(Address(end_to, 8), rax);
1577 
1578     __ addptr(end_from, 2);
1579     __ addptr(end_to, 2);
1580 
1581     // Check for and copy trailing byte
1582   __ BIND(L_copy_byte);
1583     __ testl(byte_count, 1);
1584     __ jccb(Assembler::zero, L_exit);
1585     __ movb(rax, Address(end_from, 8));
1586     __ movb(Address(end_to, 8), rax);
1587   }
1588 __ BIND(L_exit);
1589   address ucme_exit_pc = __ pc();
1590   restore_arg_regs();
1591   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1592   __ xorptr(rax, rax); // return 0
1593   __ vzeroupper();
1594   __ leave(); // required for proper stackwalking of RuntimeStub frame
1595   __ ret(0);
1596 
1597   {
1598     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1599     // Copy in multi-byte chunks
1600     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1601     __ jmp(L_copy_4_bytes);
1602   }
1603 
1604   // retrieve the registered handler addresses
1605   address end = __ pc();
1606   retrieve_unsafe_access_handlers(start, end, extras);
1607   assert(extras.length() == expected_handler_count,
1608          "unexpected handler addresses count %d", extras.length());
1609 
1610   // record the stub entry and end plus the no_push entry and any
1611   // extra handler addresses
1612   store_archive_data(stub_id, start, end, entries_ptr, &extras);
1613 
1614   return start;
1615 }
1616 
1617 
1618 // Arguments:
1619 //   entry     - location for return of (post-push) entry
1620 //   nooverlap_target - entry to branch to if no overlap detected
1621 //
1622 // Inputs:
1623 //   c_rarg0   - source array address
1624 //   c_rarg1   - destination array address
1625 //   c_rarg2   - element count, treated as ssize_t, can be zero
1626 //
1627 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1628 // we let the hardware handle it.  The one to eight bytes within words,
1629 // dwords or qwords that span cache line boundaries will still be loaded
1630 // and stored atomically.
1631 //
1632 address StubGenerator::generate_conjoint_byte_copy(address nooverlap_target, address* entry) {
1633   StubId stub_id = StubId::stubgen_jbyte_arraycopy_id;
1634   // aligned is always false -- x86_64 always uses the unaligned code
1635   const bool aligned = false;
1636 #if COMPILER2_OR_JVMCI
1637   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1638     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
1639   }
1640 #endif
1641   GrowableArray<address> entries;
1642   GrowableArray<address> extras;
1643   int expected_entry_count = (entry != nullptr ? 2 : 1);
1644   int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1645   int entry_count = StubInfo::entry_count(stub_id);
1646   assert(entry_count == expected_entry_count, "sanity check");
1647   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1648   address start = load_archive_data(stub_id, entries_ptr, &extras);
1649   if (start != nullptr) {
1650     assert(entries.length() == expected_entry_count - 1,
1651            "unexpected entry count %d", entries.length());
1652     assert(extras.length() == expected_handler_count,
1653            "unexpected handler addresses count %d", extras.length());
1654     if (entry != nullptr) {
1655       *entry = entries.at(0);
1656     }
1657     // restore 2 UMAM {start,end,handler} addresses from extras
1658     register_unsafe_access_handlers(extras, 0, 2);
1659     return start;
1660   }
1661   __ align(CodeEntryAlignment);
1662   StubCodeMark mark(this, stub_id);
1663   start = __ pc();
1664   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1665 
1666   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1667   const Register from        = rdi;  // source array address
1668   const Register to          = rsi;  // destination array address
1669   const Register count       = rdx;  // elements count
1670   const Register byte_count  = rcx;
1671   const Register qword_count = count;
1672 
1673   __ enter(); // required for proper stackwalking of RuntimeStub frame
1674   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1675 
1676   if (entry != nullptr) {
1677     *entry = __ pc();
1678     entries.append(*entry);
1679     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1680     BLOCK_COMMENT("Entry:");
1681   }
1682 
1683   array_overlap_test(nooverlap_target, Address::times_1);
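       // If src and dst do not actually overlap, branch to the disjoint
       // copy stub entry supplied in nooverlap_target.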
1684   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1685                     // r9 and r10 may be used to save non-volatile registers
1686 
1687   {
1688     // UnsafeMemoryAccess page error: continue after unsafe access
1689     UnsafeMemoryAccessMark umam(this, !aligned, true);
1690     // 'from', 'to' and 'count' are now valid
1691     __ movptr(byte_count, count);
1692     __ shrptr(count, 3);   // count => qword_count
1693 
1694     // Copy from high to low addresses.
1695 
1696     // Check for and copy trailing byte
1697     __ testl(byte_count, 1);
1698     __ jcc(Assembler::zero, L_copy_2_bytes);
1699     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1700     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1701     __ decrement(byte_count); // Adjust for possible trailing word
1702 
1703     // Check for and copy trailing word
1704   __ BIND(L_copy_2_bytes);
1705     __ testl(byte_count, 2);
1706     __ jcc(Assembler::zero, L_copy_4_bytes);
1707     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1708     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1709 
1710     // Check for and copy trailing dword
1711   __ BIND(L_copy_4_bytes);
1712     __ testl(byte_count, 4);
1713     __ jcc(Assembler::zero, L_copy_bytes);
1714     __ movl(rax, Address(from, qword_count, Address::times_8));
1715     __ movl(Address(to, qword_count, Address::times_8), rax);
1716     __ jmp(L_copy_bytes);
1717 
1718     // Copy trailing qwords
1719   __ BIND(L_copy_8_bytes);
1720     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1721     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1722     __ decrement(qword_count);
1723     __ jcc(Assembler::notZero, L_copy_8_bytes);
1724   }
1725   restore_arg_regs();
1726   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1727   __ xorptr(rax, rax); // return 0
1728   __ vzeroupper();
1729   __ leave(); // required for proper stackwalking of RuntimeStub frame
1730   __ ret(0);
1731 
1732   {
1733     // UnsafeMemoryAccess page error: continue after unsafe access
1734     UnsafeMemoryAccessMark umam(this, !aligned, true);
1735     // Copy in multi-byte chunks
1736     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1737   }
1738   restore_arg_regs();
1739   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1740   __ xorptr(rax, rax); // return 0
1741   __ vzeroupper();
1742   __ leave(); // required for proper stackwalking of RuntimeStub frame
1743   __ ret(0);
1744 
1745   // retrieve the registered handler addresses
1746   address end = __ pc();
1747   retrieve_unsafe_access_handlers(start, end, extras);
1748   assert(extras.length() == expected_handler_count,
1749          "unexpected handler addresses count %d", extras.length());
1750 
1751   // record the stub entry and end plus the no_push entry and any
1752   // extra handler addresses
1753   store_archive_data(stub_id, start, end, entries_ptr, &extras);
1754 
1755   return start;
1756 }
1757 
1758 
1759 // Arguments:
1760 //   entry     - location for return of (post-push) entry
1761 //
1762 // Inputs:
1763 //   c_rarg0   - source array address
1764 //   c_rarg1   - destination array address
1765 //   c_rarg2   - element count, treated as ssize_t, can be zero
1766 //
1767 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1768 // let the hardware handle it.  The two or four words within dwords
1769 // or qwords that span cache line boundaries will still be loaded
1770 // and stored atomically.
1771 //
1772 // Side Effects:
1773 //   entry is set to the no-overlap entry point
1774 //   used by generate_conjoint_short_copy().
1775 //
1776 address StubGenerator::generate_disjoint_short_copy(address *entry) {
1777   StubId stub_id = StubId::stubgen_jshort_disjoint_arraycopy_id;
1778   // aligned is always false -- x86_64 always uses the unaligned code
1779   const bool aligned = false;
1780 #if COMPILER2_OR_JVMCI
1781   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1782     return generate_disjoint_copy_avx3_masked(stub_id, entry);
1783   }
1784 #endif
1785   GrowableArray<address> entries;
1786   GrowableArray<address> extras;
1787   int expected_entry_count = (entry != nullptr ? 2 : 1);
1788   int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1789   int entry_count = StubInfo::entry_count(stub_id);
1790   assert(entry_count == expected_entry_count, "sanity check");
1791   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1792   address start = load_archive_data(stub_id, entries_ptr, &extras);
1793   if (start != nullptr) {
1794     assert(entries.length() == expected_entry_count - 1,
1795            "unexpected entry count %d", entries.length());
1796     assert(extras.length() == expected_handler_count,
1797            "unexpected handler addresses count %d", extras.length());
1798     if (entry != nullptr) {
1799       *entry = entries.at(0);
1800     }
1801     // restore 2 UMAM {start,end,handler} addresses from extras
1802     register_unsafe_access_handlers(extras, 0, 2);
1803     return start;
1804   }
1805   __ align(CodeEntryAlignment);
1806   StubCodeMark mark(this, stub_id);
1807   start = __ pc();
1808   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1809 
1810   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1811   const Register from        = rdi;  // source array address
1812   const Register to          = rsi;  // destination array address
1813   const Register count       = rdx;  // elements count
1814   const Register word_count  = rcx;
1815   const Register qword_count = count;
1816   const Register end_from    = from; // source array end address
1817   const Register end_to      = to;   // destination array end address
1818   // End pointers are inclusive, and if count is not zero they point
1819   // to the last unit copied:  end_to[0] := end_from[0]
1820 
1821   __ enter(); // required for proper stackwalking of RuntimeStub frame
1822   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1823 
1824   if (entry != nullptr) {
1825     *entry = __ pc();
1826     entries.append(*entry);
1827     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1828     BLOCK_COMMENT("Entry:");
1829   }
1830 
1831   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1832                     // r9 and r10 may be used to save non-volatile registers
1833 
1834   {
1835     // UnsafeMemoryAccess page error: continue after unsafe access
1836     UnsafeMemoryAccessMark umam(this, !aligned, true);
1837     // 'from', 'to' and 'count' are now valid
1838     __ movptr(word_count, count);
1839     __ shrptr(count, 2); // count => qword_count
1840 
1841     // Copy from low to high addresses.  Use 'to' as scratch.
1842     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1843     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1844     __ negptr(qword_count);
1845     __ jmp(L_copy_bytes);
1846 
1847     // Copy trailing qwords
1848   __ BIND(L_copy_8_bytes);
1849     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1850     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1851     __ increment(qword_count);
1852     __ jcc(Assembler::notZero, L_copy_8_bytes);
1853 
1854     // Original 'dest' is trashed, so we can't use it as a
1855     // base register for a possible trailing word copy
1856 
1857     // Check for and copy trailing dword
1858   __ BIND(L_copy_4_bytes);
1859     __ testl(word_count, 2);
1860     __ jccb(Assembler::zero, L_copy_2_bytes);
1861     __ movl(rax, Address(end_from, 8));
1862     __ movl(Address(end_to, 8), rax);
1863 
1864     __ addptr(end_from, 4);
1865     __ addptr(end_to, 4);
1866 
1867     // Check for and copy trailing word
1868   __ BIND(L_copy_2_bytes);
1869     __ testl(word_count, 1);
1870     __ jccb(Assembler::zero, L_exit);
1871     __ movw(rax, Address(end_from, 8));
1872     __ movw(Address(end_to, 8), rax);
1873   }
1874 __ BIND(L_exit);
1875   address ucme_exit_pc = __ pc();
1876   restore_arg_regs();
1877   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1878   __ xorptr(rax, rax); // return 0
1879   __ vzeroupper();
1880   __ leave(); // required for proper stackwalking of RuntimeStub frame
1881   __ ret(0);
1882 
1883   {
1884     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1885     // Copy in multi-byte chunks
1886     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1887     __ jmp(L_copy_4_bytes);
1888   }
1889 
1890   // retrieve the registered handler addresses
1891   address end = __ pc();
1892   retrieve_unsafe_access_handlers(start, end, extras);
1893   assert(extras.length() == expected_handler_count,
1894          "unexpected handler addresses count %d", extras.length());
1895 
1896   // record the stub entry and end plus the no_push entry and any
1897   // extra handler addresses
1898   store_archive_data(stub_id, start, end, entries_ptr, &extras);
1899 
1900   return start;
1901 }
1902 
1903 
1904 address StubGenerator::generate_fill(StubId stub_id) {
1905   BasicType t;
1906   bool aligned;
1907   switch (stub_id) {
1908   case StubId::stubgen_jbyte_fill_id:
1909     t = T_BYTE;
1910     aligned = false;
1911     break;
1912   case StubId::stubgen_jshort_fill_id:
1913     t = T_SHORT;
1914     aligned = false;
1915     break;
1916   case StubId::stubgen_jint_fill_id:
1917     t = T_INT;
1918     aligned = false;
1919     break;
1920   case StubId::stubgen_arrayof_jbyte_fill_id:
1921     t = T_BYTE;
1922     aligned = true;
1923     break;
1924   case StubId::stubgen_arrayof_jshort_fill_id:
1925     t = T_SHORT;
1926     aligned = true;
1927     break;
1928   case StubId::stubgen_arrayof_jint_fill_id:
1929     t = T_INT;
1930     aligned = true;
1931     break;
1932   default:
1933     ShouldNotReachHere();
1934   }
1935   int entry_count = StubInfo::entry_count(stub_id);
1936   assert(entry_count == 1, "sanity check");
1937   GrowableArray<address> extras;
1938   bool add_handlers = ((t == T_BYTE) && !aligned);
1939   int handlers_count = (add_handlers ? 1 : 0);
1940   int expected_extras_count = (handlers_count * UnsafeMemoryAccess::COLUMN_COUNT); // 0/1 x UMAM {start,end,handler}
1941   GrowableArray<address>* extras_ptr = (add_handlers ? &extras : nullptr);
1942   address start = load_archive_data(stub_id, nullptr, extras_ptr);
1943   if (start != nullptr) {
1944     assert(extras.length() == expected_extras_count,
1945            "unexpected handler addresses count %d", extras.length());
1946     if (add_handlers) {
1947       // restore 1 x UMAM {start,end,handler} addresses from extras
1948       register_unsafe_access_handlers(extras, 0, 1);
1949     }
1950     return start;
1951   }
1952 
1953   __ align(CodeEntryAlignment);
1954   StubCodeMark mark(this, stub_id);
1955   start = __ pc();
1956 
1957   BLOCK_COMMENT("Entry:");
1958 
1959   const Register to       = c_rarg0;  // destination array address
1960   const Register value    = c_rarg1;  // value
1961   const Register count    = c_rarg2;  // elements count
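       // Work on a copy of the count in r11: the fill code consumes its
       // count register.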
1962   __ mov(r11, count);
1963 
1964   __ enter(); // required for proper stackwalking of RuntimeStub frame
1965 
1966   {
1967     // Add an unsafe-access mark around the fill so a fault in an unsafe setMemory can continue after the access
1968     UnsafeMemoryAccessMark umam(this, add_handlers, true);
1969     __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1970   }
1971 
1972   __ vzeroupper();
1973   __ leave(); // required for proper stackwalking of RuntimeStub frame
1974   __ ret(0);
1975 
1976   address end = __ pc();
1977   if (add_handlers) {
1978     retrieve_unsafe_access_handlers(start, end, extras);
1979   }
1980   assert(extras.length() == expected_extras_count,
1981          "unexpected handler addresses count %d", extras.length());
1982   // record the stub entry and end
1983   store_archive_data(stub_id, start, end, nullptr, extras_ptr);
1984 
1985   return start;
1986 }
1987 
1988 
1989 // Arguments:
1990 //   entry     - location for return of (post-push) entry
1991 //   nooverlap_target - entry to branch to if no overlap detected
1992 //
1993 // Inputs:
1994 //   c_rarg0   - source array address
1995 //   c_rarg1   - destination array address
1996 //   c_rarg2   - element count, treated as ssize_t, can be zero
1997 //
1998 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1999 // let the hardware handle it.  The two or four words within dwords
2000 // or qwords that span cache line boundaries will still be loaded
2001 // and stored atomically.
2002 //
2003 address StubGenerator::generate_conjoint_short_copy(address nooverlap_target, address *entry) {
2004   StubId stub_id = StubId::stubgen_jshort_arraycopy_id;
2005   // aligned is always false -- x86_64 always uses the unaligned code
2006   const bool aligned = false;
2007 #if COMPILER2_OR_JVMCI
2008   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2009     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2010   }
2011 #endif
2012   GrowableArray<address> entries;
2013   GrowableArray<address> extras;
2014   int expected_entry_count = (entry != nullptr ? 2 : 1);
2015   int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
2016   int entry_count = StubInfo::entry_count(stub_id);
2017   assert(entry_count == expected_entry_count, "sanity check");
2018   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2019   address start = load_archive_data(stub_id, entries_ptr, &extras);
2020   if (start != nullptr) {
2021     assert(entries.length() == expected_entry_count - 1,
2022            "unexpected entry count %d", entries.length());
2023     assert(extras.length() == expected_handler_count,
2024            "unexpected handler addresses count %d", extras.length());
2025     if (entry != nullptr) {
2026       *entry = entries.at(0);
2027     }
2028     // restore 2 UMAM {start,end,handler} addresses from extras
2029     register_unsafe_access_handlers(extras, 0, 2);
2030     return start;
2031   }
2032   __ align(CodeEntryAlignment);
2033   StubCodeMark mark(this, stub_id);
2034   start = __ pc();
2035   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2036 
2037   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2038   const Register from        = rdi;  // source array address
2039   const Register to          = rsi;  // destination array address
2040   const Register count       = rdx;  // elements count
2041   const Register word_count  = rcx;
2042   const Register qword_count = count;
2043 
2044   __ enter(); // required for proper stackwalking of RuntimeStub frame
2045   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2046 
2047   if (entry != nullptr) {
2048     *entry = __ pc();
2049     entries.append(*entry);
2050     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2051     BLOCK_COMMENT("Entry:");
2052   }
2053 
2054   array_overlap_test(nooverlap_target, Address::times_2);
2055   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2056                     // r9 and r10 may be used to save non-volatile registers
2057 
2058   {
2059     // UnsafeMemoryAccess page error: continue after unsafe access
2060     UnsafeMemoryAccessMark umam(this, !aligned, true);
2061     // 'from', 'to' and 'count' are now valid
2062     __ movptr(word_count, count);
2063     __ shrptr(count, 2); // count => qword_count
2064 
2065     // Copy from high to low addresses.
2066 
2067     // Check for and copy trailing word
2068     __ testl(word_count, 1);
2069     __ jccb(Assembler::zero, L_copy_4_bytes);
2070     __ movw(rax, Address(from, word_count, Address::times_2, -2));
2071     __ movw(Address(to, word_count, Address::times_2, -2), rax);
2072 
2073     // Check for and copy trailing dword
2074   __ BIND(L_copy_4_bytes);
2075     __ testl(word_count, 2);
2076     __ jcc(Assembler::zero, L_copy_bytes);
2077     __ movl(rax, Address(from, qword_count, Address::times_8));
2078     __ movl(Address(to, qword_count, Address::times_8), rax);
2079     __ jmp(L_copy_bytes);
2080 
2081     // Copy trailing qwords
2082   __ BIND(L_copy_8_bytes);
2083     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2084     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2085     __ decrement(qword_count);
2086     __ jcc(Assembler::notZero, L_copy_8_bytes);
2087   }
2088   restore_arg_regs();
2089   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2090   __ xorptr(rax, rax); // return 0
2091   __ vzeroupper();
2092   __ leave(); // required for proper stackwalking of RuntimeStub frame
2093   __ ret(0);
2094 
2095   {
2096     // UnsafeMemoryAccess page error: continue after unsafe access
2097     UnsafeMemoryAccessMark umam(this, !aligned, true);
2098     // Copy in multi-byte chunks
2099     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
2100   }
2101   restore_arg_regs();
2102   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2103   __ xorptr(rax, rax); // return 0
2104   __ vzeroupper();
2105   __ leave(); // required for proper stackwalking of RuntimeStub frame
2106   __ ret(0);
2107 
2108   // retrieve the registered handler addresses
2109   address end = __ pc();
2110   retrieve_unsafe_access_handlers(start, end, extras);
2111   assert(extras.length() == expected_handler_count,
2112          "unexpected handler addresses count %d", extras.length());
2113 
2114   // record the stub entry and end plus the no_push entry and any
2115   // extra handler addresses
2116   store_archive_data(stub_id, start, end, entries_ptr, &extras);
2117 
2118   return start;
2119 }
2120 
2121 
2122 // Arguments:
2123 //   stub_id   - unique id for stub to generate
2124 //   entry     - location for return of (post-push) entry
2125 //   is_oop    - true => oop array, so generate store check code
2126 //
2127 // Inputs:
2128 //   c_rarg0   - source array address
2129 //   c_rarg1   - destination array address
2130 //   c_rarg2   - element count, treated as ssize_t, can be zero
2131 //
2132 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2133 // the hardware handle it.  The two dwords within qwords that span
2134 // cache line boundaries will still be loaded and stored atomically.
2135 //
2136 // Side Effects:
2137 //   disjoint_int_copy_entry is set to the no-overlap entry point
2138 //   used by generate_conjoint_int_oop_copy().
2139 //
2140 address StubGenerator::generate_disjoint_int_oop_copy(StubId stub_id, address* entry) {
2141   // aligned is always false -- x86_64 always uses the unaligned code
2142   const bool aligned = false;
2143   bool is_oop;
2144   bool dest_uninitialized;
2145   switch (stub_id) {
2146   case StubId::stubgen_jint_disjoint_arraycopy_id:
2147     is_oop = false;
2148     dest_uninitialized = false;
2149     break;
2150   case StubId::stubgen_oop_disjoint_arraycopy_id:
2151     assert(UseCompressedOops, "inconsistent oop copy size!");
2152     is_oop = true;
2153     dest_uninitialized = false;
2154     break;
2155   case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2156     assert(UseCompressedOops, "inconsistent oop copy size!");
2157     is_oop = true;
2158     dest_uninitialized = true;
2159     break;
2160   default:
2161     ShouldNotReachHere();
2162   }
2163 
2164   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2165 #if COMPILER2_OR_JVMCI
2166   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2167     return generate_disjoint_copy_avx3_masked(stub_id, entry);
2168   }
2169 #endif
2170   GrowableArray<address> entries;
2171   GrowableArray<address> extras;
2172   bool add_handlers = !is_oop && !aligned;
2173   bool add_relocs = UseZGC && is_oop;
2174   bool add_extras = add_handlers || add_relocs;
2175   int expected_entry_count = (entry != nullptr ? 2 : 1);
2176   int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2177   int entry_count = StubInfo::entry_count(stub_id);
2178   assert(entry_count == expected_entry_count, "sanity check");
2179   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2180   GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2181   address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2182   if (start != nullptr) {
2183     assert(entries.length() == expected_entry_count - 1,
2184            "unexpected entry count %d", entries.length());
2185     assert(!add_handlers || extras.length() == expected_handler_count,
2186            "unexpected handler addresses count %d", extras.length());
2187     if (entry != nullptr) {
2188       *entry = entries.at(0);
2189     }
2190     if (add_handlers) {
2191       // restore 2 UMAM {start,end,handler} addresses from extras
2192       register_unsafe_access_handlers(extras, 0, 2);
2193     }
2194 #if INCLUDE_ZGC
2195     // register addresses at which ZGC does colour patching
2196     if (add_relocs)  {
2197       register_reloc_addresses(extras, 0, extras.length());
2198     }
2199 #endif // INCLUDE_ZGC
2200     return start;
2201   }
2202 
2203   __ align(CodeEntryAlignment);
2204   StubCodeMark mark(this, stub_id);
2205   start = __ pc();
2206 
2207   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2208   const Register from        = rdi;  // source array address
2209   const Register to          = rsi;  // destination array address
2210   const Register count       = rdx;  // elements count
2211   const Register dword_count = rcx;
2212   const Register qword_count = count;
2213   const Register end_from    = from; // source array end address
2214   const Register end_to      = to;   // destination array end address
2215   // End pointers are inclusive, and if count is not zero they point
2216   // to the last unit copied:  end_to[0] := end_from[0]
2217 
2218   __ enter(); // required for proper stackwalking of RuntimeStub frame
2219   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2220 
2221   if (entry != nullptr) {
2222     *entry = __ pc();
2223     entries.append(*entry);
2224     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2225     BLOCK_COMMENT("Entry:");
2226   }
2227 
2228   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2229                                  // r9 is used to save r15_thread
2230 
2231   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2232   if (dest_uninitialized) {
2233     decorators |= IS_DEST_UNINITIALIZED;
2234   }
2235   if (aligned) {
2236     decorators |= ARRAYCOPY_ALIGNED;
2237   }
2238 
2239   BasicType type = is_oop ? T_OBJECT : T_INT;
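       // Give the GC barrier set a chance to emit pre-copy work for oop
       // arrays (typically a no-op for plain T_INT copies).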
2240   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2241 
2242   {
2243     // UnsafeMemoryAccess page error: continue after unsafe access
2244     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2245     // 'from', 'to' and 'count' are now valid
2246     __ movptr(dword_count, count);
2247     __ shrptr(count, 1); // count => qword_count
2248 
2249     // Copy from low to high addresses.  Use 'to' as scratch.
2250     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2251     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2252     __ negptr(qword_count);
2253     __ jmp(L_copy_bytes);
2254 
2255     // Copy trailing qwords
2256     __ BIND(L_copy_8_bytes);
2257     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2258     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2259     __ increment(qword_count);
2260     __ jcc(Assembler::notZero, L_copy_8_bytes);
2261 
2262     // Check for and copy trailing dword
2263     __ BIND(L_copy_4_bytes);
2264     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2265     __ jccb(Assembler::zero, L_exit);
2266     __ movl(rax, Address(end_from, 8));
2267     __ movl(Address(end_to, 8), rax);
2268   }
2269   __ BIND(L_exit);
2270   address ucme_exit_pc = __ pc();
2271   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2272   restore_arg_regs_using_thread();
2273   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2274   __ vzeroupper();
2275   __ xorptr(rax, rax); // return 0
2276   __ leave(); // required for proper stackwalking of RuntimeStub frame
2277   __ ret(0);
2278 
2279   {
2280     UnsafeMemoryAccessMark umam(this, add_handlers, false, ucme_exit_pc);
2281     // Copy in multi-byte chunks
2282     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2283     __ jmp(L_copy_4_bytes);
2284   }
2285 
2286   // retrieve the registered handler addresses
2287   address end = __ pc();
2288   if (add_handlers) {
2289     retrieve_unsafe_access_handlers(start, end, extras);
2290   }
2291   assert(extras.length() == expected_handler_count,
2292          "unexpected handler addresses count %d", extras.length());
2293 #if INCLUDE_ZGC
2294   // retrieve addresses at which ZGC does colour patching
2295   if (add_relocs) {
2296     retrieve_reloc_addresses(start, end, extras);
2297   }
2298 #endif // INCLUDE_ZGC
2299 
2300   // record the stub entry and end plus the no_push entry and any
2301   // extra handler addresses
2302   store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2303 
2304   return start;
2305 }
2306 
2307 
2308 // Arguments:
2309 //   entry     - location for return of (post-push) entry
2310 //   nooverlap_target - entry to branch to if no overlap detected
2311 //   is_oop  - true => oop array, so generate store check code
2312 //
2313 // Inputs:
2314 //   c_rarg0   - source array address
2315 //   c_rarg1   - destination array address
2316 //   c_rarg2   - element count, treated as ssize_t, can be zero
2317 //
2318 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2319 // the hardware handle it.  The two dwords within qwords that span
2320 // cache line boundaries will still be loaded and stored atomically.
2321 //
2322 address StubGenerator::generate_conjoint_int_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2323   // aligned is always false -- x86_64 always uses the unaligned code
2324   const bool aligned = false;
2325   bool is_oop;
2326   bool dest_uninitialized;
2327   switch (stub_id) {
2328   case StubId::stubgen_jint_arraycopy_id:
2329     is_oop = false;
2330     dest_uninitialized = false;
2331     break;
2332   case StubId::stubgen_oop_arraycopy_id:
2333     assert(UseCompressedOops, "inconsistent oop copy size!");
2334     is_oop = true;
2335     dest_uninitialized = false;
2336     break;
2337   case StubId::stubgen_oop_arraycopy_uninit_id:
2338     assert(UseCompressedOops, "inconsistent oop copy size!");
2339     is_oop = true;
2340     dest_uninitialized = true;
2341     break;
2342   default:
2343     ShouldNotReachHere();
2344   }
2345 
2346   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2347 #if COMPILER2_OR_JVMCI
2348   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2349     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2350   }
2351 #endif
2352   bool add_handlers = !is_oop && !aligned;
2353   bool add_relocs = UseZGC && is_oop;
2354   bool add_extras = add_handlers || add_relocs;
2355   GrowableArray<address> entries;
2356   GrowableArray<address> extras;
2357   int expected_entry_count = (entry != nullptr ? 2 : 1);
2358   int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2359   int entry_count = StubInfo::entry_count(stub_id);
2360   assert(entry_count == expected_entry_count, "sanity check");
2361   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2362   GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2363   address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2364   if (start != nullptr) {
2365     assert(entries.length() == expected_entry_count - 1,
2366            "unexpected entry count %d", entries.length());
2367     assert(!add_handlers || extras.length() == expected_handler_count,
2368            "unexpected handler addresses count %d", extras.length());
2369     if (entry != nullptr) {
2370       *entry = entries.at(0);
2371     }
2372     if (add_handlers) {
2373       // restore 2 UMAM {start,end,handler} addresses from extras
2374       register_unsafe_access_handlers(extras, 0, 2);
2375     }
2376 #if INCLUDE_ZGC
2377     // register addresses at which ZGC does colour patching
2378     if (add_relocs)  {
2379       register_reloc_addresses(extras, 0, extras.length());
2380     }
2381 #endif // INCLUDE_ZGC
2382     return start;
2383   }
2384 
2385   __ align(CodeEntryAlignment);
2386   StubCodeMark mark(this, stub_id);
2387   start = __ pc();
2388 
2389   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2390   const Register from        = rdi;  // source array address
2391   const Register to          = rsi;  // destination array address
2392   const Register count       = rdx;  // elements count
2393   const Register dword_count = rcx;
2394   const Register qword_count = count;
2395 
2396   __ enter(); // required for proper stackwalking of RuntimeStub frame
2397   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2398 
2399   if (entry != nullptr) {
2400     *entry = __ pc();
2401     entries.append(*entry);
2402     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2403     BLOCK_COMMENT("Entry:");
2404   }
2405 
2406   array_overlap_test(nooverlap_target, Address::times_4);
2407   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2408                                  // r9 is used to save r15_thread
2409 
2410   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2411   if (dest_uninitialized) {
2412     decorators |= IS_DEST_UNINITIALIZED;
2413   }
2414   if (aligned) {
2415     decorators |= ARRAYCOPY_ALIGNED;
2416   }
2417 
2418   BasicType type = is_oop ? T_OBJECT : T_INT;
2419   // no registers are destroyed by this call
2420   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2421 
2422   assert_clean_int(count, rax); // Make sure 'count' is clean int.
2423   {
2424     // UnsafeMemoryAccess page error: continue after unsafe access
2425     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2426     // 'from', 'to' and 'count' are now valid
2427     __ movptr(dword_count, count);
2428     __ shrptr(count, 1); // count => qword_count
2429 
2430     // Copy from high to low addresses.
2431 
2432     // Check for and copy trailing dword
2433     __ testl(dword_count, 1);
2434     __ jcc(Assembler::zero, L_copy_bytes);
2435     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2436     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2437     __ jmp(L_copy_bytes);
2438 
2439     // Copy trailing qwords
2440     __ BIND(L_copy_8_bytes);
2441     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2442     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2443     __ decrement(qword_count);
2444     __ jcc(Assembler::notZero, L_copy_8_bytes);
2445   }
2446   if (is_oop) {
2447     __ jmp(L_exit);
2448   }
2449   restore_arg_regs_using_thread();
2450   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2451   __ xorptr(rax, rax); // return 0
2452   __ vzeroupper();
2453   __ leave(); // required for proper stackwalking of RuntimeStub frame
2454   __ ret(0);
2455 
2456   {
2457     // UnsafeMemoryAccess page error: continue after unsafe access
2458     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2459     // Copy in multi-byte chunks
2460     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2461   }
2462 
2463   __ BIND(L_exit);
2464   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2465   restore_arg_regs_using_thread();
2466   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2467   __ xorptr(rax, rax); // return 0
2468   __ vzeroupper();
2469   __ leave(); // required for proper stackwalking of RuntimeStub frame
2470   __ ret(0);
2471 
2472   // retrieve the registered handler addresses
2473   address end = __ pc();
2474   if (add_handlers) {
2475     retrieve_unsafe_access_handlers(start, end, extras);
2476   }
2477   assert(extras.length() == expected_handler_count,
2478          "unexpected handler addresses count %d", extras.length());
2479 #if INCLUDE_ZGC
2480   // retrieve addresses at which ZGC does colour patching
2481   if (add_relocs) {
2482     retrieve_reloc_addresses(start, end, extras);
2483   }
2484 #endif // INCLUDE_ZGC
2485   // record the stub entry and end plus the no_push entry and any
2486   // extra handler addresses
2487   store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2488 
2489   return start;
2490 }
2491 
2492 
2493 // Arguments:
2494 //   entry     - location for return of (post-push) entry
2495 //
2496 // Inputs:
2497 //   c_rarg0   - source array address
2498 //   c_rarg1   - destination array address
2499 //   c_rarg2   - element count, treated as ssize_t, can be zero
2500 //
2501 // Side Effects:
2502 //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2503 //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2504 //
2505 address StubGenerator::generate_disjoint_long_oop_copy(StubId stub_id, address *entry) {
2506   // aligned is always false -- x86_64 always uses the unaligned code
2507   const bool aligned = false;
2508   bool is_oop;
2509   bool dest_uninitialized;
2510   switch (stub_id) {
2511   case StubId::stubgen_jlong_disjoint_arraycopy_id:
2512     is_oop = false;
2513     dest_uninitialized = false;
2514     break;
2515   case StubId::stubgen_oop_disjoint_arraycopy_id:
2516     assert(!UseCompressedOops, "inconsistent oop copy size!");
2517     is_oop = true;
2518     dest_uninitialized = false;
2519     break;
2520   case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2521     assert(!UseCompressedOops, "inconsistent oop copy size!");
2522     is_oop = true;
2523     dest_uninitialized = true;
2524     break;
2525   default:
2526     ShouldNotReachHere();
2527   }
2528 
2529   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2530 #if COMPILER2_OR_JVMCI
2531   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2532     return generate_disjoint_copy_avx3_masked(stub_id, entry);
2533   }
2534 #endif
2535   bool add_handlers = !is_oop && !aligned;
2536   bool add_relocs = UseZGC && is_oop;
2537   bool add_extras = add_handlers || add_relocs;
2538   GrowableArray<address> entries;
2539   GrowableArray<address> extras;
2540   int expected_entry_count = (entry != nullptr ? 2 : 1);
2541   int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2542   int entry_count = StubInfo::entry_count(stub_id);
2543   assert(entry_count == expected_entry_count, "sanity check");
2544   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2545   GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
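  // If a previously saved code blob exists for this stub id (e.g. in an
  // AOT code cache), load_archive_data() below returns its address, and
  // the code that follows only re-registers the side tables (extra entry
  // points, unsafe-access handlers, ZGC relocation addresses) instead of
  // regenerating the stub.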
2546   address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2547   if (start != nullptr) {
2548     assert(entries.length() == expected_entry_count - 1,
2549            "unexpected entry count %d", entries.length());
2550     assert(!add_handlers || extras.length() == expected_handler_count,
2551            "unexpected handler addresses count %d", extras.length());
2552     if (entry != nullptr) {
2553       *entry = entries.at(0);
2554     }
2555     if (add_handlers) {
2556       // restore 2 UMAM {start,end,handler} addresses from extras
2557       register_unsafe_access_handlers(extras, 0, 2);
2558     }
2559 #if INCLUDE_ZGC
2560     // register addresses at which ZGC does colour patching
2561     if (add_relocs) {
2562       register_reloc_addresses(extras, 0, extras.length());
2563     }
2564 #endif // INCLUDE_ZGC
2565     return start;
2566   }
2567 
2568   __ align(CodeEntryAlignment);
2569   StubCodeMark mark(this, stub_id);
2570   start = __ pc();
2571 
2572   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2573   const Register from        = rdi;  // source array address
2574   const Register to          = rsi;  // destination array address
2575   const Register qword_count = rdx;  // elements count
2576   const Register end_from    = from; // source array end address
2577   const Register end_to      = rcx;  // destination array end address
2578   const Register saved_count = r11;
2579   // End pointers are inclusive, and if count is not zero they point
2580   // to the last unit copied:  end_to[0] := end_from[0]
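  // Illustration only (not generated code): for qword_count == N the leas
  // in the copy body leave end_from == from + 8*N - 8 and end_to likewise,
  // and after negptr() the index q runs from -N up toward zero, so the
  // trailing-qword loop reads Address(end_from, q, times_8, 8), i.e.
  // from + 8*(N + q), copying elements in ascending order.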
2581 
2582   __ enter(); // required for proper stackwalking of RuntimeStub frame
2583   // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2584   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2585 
2586   if (entry != nullptr) {
2587     *entry = __ pc();
2588     entries.append(*entry);
2589     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2590     BLOCK_COMMENT("Entry:");
2591   }
2592 
2593   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2594                                  // r9 is used to save r15_thread
2595   // 'from', 'to' and 'qword_count' are now valid
2596 
2597   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2598   if (dest_uninitialized) {
2599     decorators |= IS_DEST_UNINITIALIZED;
2600   }
2601   if (aligned) {
2602     decorators |= ARRAYCOPY_ALIGNED;
2603   }
2604 
2605   BasicType type = is_oop ? T_OBJECT : T_LONG;
2606   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2607   {
2608     // UnsafeMemoryAccess page error: continue after unsafe access
2609     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2610 
2611     // Copy from low to high addresses.  Use 'to' as scratch.
2612     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2613     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2614     __ negptr(qword_count);
2615     __ jmp(L_copy_bytes);
2616 
2617     // Copy trailing qwords
2618   __ BIND(L_copy_8_bytes);
2619     bs->copy_load_at(_masm, decorators, type, 8,
2620                      rax, Address(end_from, qword_count, Address::times_8, 8),
2621                      r10);
2622     bs->copy_store_at(_masm, decorators, type, 8,
2623                       Address(end_to, qword_count, Address::times_8, 8), rax,
2624                       r10);
2625     __ increment(qword_count);
2626     __ jcc(Assembler::notZero, L_copy_8_bytes);
2627   }
2628   if (is_oop) {
2629     __ jmp(L_exit);
2630   } else {
2631     restore_arg_regs_using_thread();
2632     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2633     __ xorptr(rax, rax); // return 0
2634     __ vzeroupper();
2635     __ leave(); // required for proper stackwalking of RuntimeStub frame
2636     __ ret(0);
2637   }
2638 
2639   {
2640     // UnsafeMemoryAccess page error: continue after unsafe access
2641     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2642     // Copy in multi-bytes chunks
2643     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2644   }
2645 
2646   __ BIND(L_exit);
2647   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2648   restore_arg_regs_using_thread();
2649   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2650                           SharedRuntime::_jlong_array_copy_ctr,
2651                  rscratch1); // Update counter after rscratch1 is free
2652   __ vzeroupper();
2653   __ xorptr(rax, rax); // return 0
2654   __ leave(); // required for proper stackwalking of RuntimeStub frame
2655   __ ret(0);
2656 
2657   // retrieve the registered handler addresses
2658   address end = __ pc();
2659   if (add_handlers) {
2660     retrieve_unsafe_access_handlers(start, end, extras);
2661   }
2662   assert(extras.length() == expected_handler_count,
2663          "unexpected handler addresses count %d", extras.length());
2664 #if INCLUDE_ZGC
2665   // retrieve addresses at which ZGC does colour patching
2666   if (add_relocs) {
2667     retrieve_reloc_addresses(start, end, extras);
2668   }
2669 #endif // INCLUDE_ZGC
2670   // record the stub entry and end plus the no_push entry and any
2671   // extra handler addresses
2672   store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2673 
2674   return start;
2675 }
2676 
2677 
2678 // Arguments:
2679 //   entry     - location for return of (post-push) entry
2680 //   nooverlap_target - entry to branch to if no overlap detected
2681 //   is_oop  - true => oop array, so generate store check code
2682 //
2683 // Inputs:
2684 //   c_rarg0   - source array address
2685 //   c_rarg1   - destination array address
2686 //   c_rarg2   - element count, treated as ssize_t, can be zero
2687 //
2688 address StubGenerator::generate_conjoint_long_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2689   // aligned is always false -- x86_64 always uses the unaligned code
2690   const bool aligned = false;
2691   bool is_oop;
2692   bool dest_uninitialized;
2693   switch (stub_id) {
2694   case StubId::stubgen_jlong_arraycopy_id:
2695     is_oop = false;
2696     dest_uninitialized = false;
2697     break;
2698   case StubId::stubgen_oop_arraycopy_id:
2699     assert(!UseCompressedOops, "inconsistent oop copy size!");
2700     is_oop = true;
2701     dest_uninitialized = false;
2702     break;
2703   case StubId::stubgen_oop_arraycopy_uninit_id:
2704     assert(!UseCompressedOops, "inconsistent oop copy size!");
2705     is_oop = true;
2706     dest_uninitialized = true;
2707     break;
2708   default:
2709     ShouldNotReachHere();
2710   }
2711 
2712   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2713 #if COMPILER2_OR_JVMCI
2714   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2715     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2716   }
2717 #endif
2718   bool add_handlers = !is_oop && !aligned;
2719   bool add_relocs = UseZGC && is_oop;
2720   bool add_extras = add_handlers || add_relocs;
2721   GrowableArray<address> entries;
2722   GrowableArray<address> extras;
2723   int expected_entry_count = (entry != nullptr ? 2 : 1);
2724   int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2725   int entry_count = StubInfo::entry_count(stub_id);
2726   assert(entry_count == expected_entry_count, "sanity check");
2727   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2728   GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2729   address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2730   if (start != nullptr) {
2731     assert(entries.length() == expected_entry_count - 1,
2732            "unexpected entry count %d", entries.length());
2733     assert(!add_handlers || extras.length() == expected_handler_count,
2734            "unexpected handler addresses count %d", extras.length());
2735     if (entry != nullptr) {
2736       *entry = entries.at(0);
2737     }
2738     if (add_handlers) {
2739       // restore 2 UMAM {start,end,handler} addresses from extras
2740       register_unsafe_access_handlers(extras, 0, 2);
2741     }
2742 #if INCLUDE_ZGC
2743     // register addresses at which ZGC does colour patching
2744     if (add_relocs) {
2745       register_reloc_addresses(extras, 0, extras.length());
2746     }
2747 #endif // INCLUDE_ZGC
2748     return start;
2749   }
2750 
2751   __ align(CodeEntryAlignment);
2752   StubCodeMark mark(this, stub_id);
2753   start = __ pc();
2754 
2755   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2756   const Register from        = rdi;  // source array address
2757   const Register to          = rsi;  // destination array address
2758   const Register qword_count = rdx;  // elements count
2759   const Register saved_count = rcx;
2760 
2761   __ enter(); // required for proper stackwalking of RuntimeStub frame
2762   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2763 
2764   if (entry != nullptr) {
2765     *entry = __ pc();
2766     entries.append(*entry);
2767     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2768     BLOCK_COMMENT("Entry:");
2769   }
2770 
2771   array_overlap_test(nooverlap_target, Address::times_8);
2772   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2773                                  // r9 is used to save r15_thread
2774   // 'from', 'to' and 'qword_count' are now valid
2775 
2776   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2777   if (dest_uninitialized) {
2778     decorators |= IS_DEST_UNINITIALIZED;
2779   }
2780   if (aligned) {
2781     decorators |= ARRAYCOPY_ALIGNED;
2782   }
2783 
2784   BasicType type = is_oop ? T_OBJECT : T_LONG;
2785   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2786   {
2787     // UnsafeMemoryAccess page error: continue after unsafe access
2788     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2789 
2790     __ jmp(L_copy_bytes);
2791 
2792     // Copy trailing qwords
2793   __ BIND(L_copy_8_bytes);
2794     bs->copy_load_at(_masm, decorators, type, 8,
2795                      rax, Address(from, qword_count, Address::times_8, -8),
2796                      r10);
2797     bs->copy_store_at(_masm, decorators, type, 8,
2798                       Address(to, qword_count, Address::times_8, -8), rax,
2799                       r10);
2800     __ decrement(qword_count);
2801     __ jcc(Assembler::notZero, L_copy_8_bytes);
2802   }
2803   if (is_oop) {
2804     __ jmp(L_exit);
2805   } else {
2806     restore_arg_regs_using_thread();
2807     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2808     __ xorptr(rax, rax); // return 0
2809     __ vzeroupper();
2810     __ leave(); // required for proper stackwalking of RuntimeStub frame
2811     __ ret(0);
2812   }
2813   {
2814     // UnsafeMemoryAccess page error: continue after unsafe access
2815     UnsafeMemoryAccessMark umam(this, add_handlers, true);
2816 
2817     // Copy in multi-bytes chunks
2818     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2819   }
2820   __ BIND(L_exit);
2821   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2822   restore_arg_regs_using_thread();
2823   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2824                           SharedRuntime::_jlong_array_copy_ctr,
2825                  rscratch1); // Update counter after rscratch1 is free
2826   __ vzeroupper();
2827   __ xorptr(rax, rax); // return 0
2828   __ leave(); // required for proper stackwalking of RuntimeStub frame
2829   __ ret(0);
2830 
2832   // retrieve the registered handler addresses
2833   address end = __ pc();
2834   if (add_handlers) {
2835     retrieve_unsafe_access_handlers(start, end, extras);
2836   }
2837   assert(extras.length() == expected_handler_count,
2838          "unexpected handler addresses count %d", extras.length());
2839 #if INCLUDE_ZGC
2840   // retrieve addresses at which ZGC does colour patching
2841   if (add_relocs) {
2842     retrieve_reloc_addresses(start, end, extras);
2843   }
2844 #endif // INCLUDE_ZGC
2845   // record the stub entry and end plus the no_push entry and any
2846   // extra handler addresses
2847   store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2848 
2849   return start;
2850 }
2851 
2852 
2853 // Helper for generating a dynamic type check.
2854 // Smashes no registers.
2855 void StubGenerator::generate_type_check(Register sub_klass,
2856                                         Register super_check_offset,
2857                                         Register super_klass,
2858                                         Label& L_success) {
2859   assert_different_registers(sub_klass, super_check_offset, super_klass);
2860 
2861   BLOCK_COMMENT("type_check:");
2862 
2863   Label L_miss;
2864 
2865   __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
2866                                    super_check_offset);
2867   __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
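  // Passing nullptr for the optional labels makes each helper fall
  // through rather than jump: the fast path falls into the slow path when
  // it cannot decide, and the slow path falls through when the
  // secondary-supers scan fails, so only a definite miss reaches L_miss.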
2868 
2869   // Fall through on failure!
2870   __ BIND(L_miss);
2871 }
2872 
2873 //
2874 //  Generate checkcasting array copy stub
2875 //
2876 //  Input:
2877 //    c_rarg0   - source array address
2878 //    c_rarg1   - destination array address
2879 //    c_rarg2   - element count, treated as ssize_t, can be zero
2880 //    c_rarg3   - size_t ckoff (super_check_offset)
2881 // not Win64
2882 //    c_rarg4   - oop ckval (super_klass)
2883 // Win64
2884 //    rsp+40    - oop ckval (super_klass)
2885 //
2886 //  Output:
2887 //    rax ==  0  -  success
2888 //    rax == -1^K - failure, where K is partial transfer count
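//    (i.e. rax == ~K: for example, a failure after 3 elements have been
//    copied reports rax == -4, and the caller recovers K as ~rax)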
2889 //
2890 address StubGenerator::generate_checkcast_copy(StubId stub_id, address *entry) {
2891 
2892   bool dest_uninitialized;
2893   switch (stub_id) {
2894   case StubId::stubgen_checkcast_arraycopy_id:
2895     dest_uninitialized = false;
2896     break;
2897   case StubId::stubgen_checkcast_arraycopy_uninit_id:
2898     dest_uninitialized = true;
2899     break;
2900   default:
2901     ShouldNotReachHere();
2902   }
2903 
2904   GrowableArray<address> entries;
2905   GrowableArray<address> extras;
2906   int expected_entry_count = (entry != nullptr ? 2 : 1);
2907   int entry_count = StubInfo::entry_count(stub_id);
2908   assert(entry_count == expected_entry_count, "sanity check");
2909   GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2910   GrowableArray<address>* extras_ptr = (UseZGC ? &extras : nullptr);
2911   address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2912   if (start != nullptr) {
2913     assert(entries.length() == expected_entry_count - 1,
2914            "unexpected addresses count %d", entries.length());
2915     if (entry != nullptr) {
2916       *entry = entries.at(0);
2917     }
2918 #if INCLUDE_ZGC
2919     if (UseZGC) {
2920       register_reloc_addresses(extras, 0, extras.length());
2921     }
2922 #endif // INCLUDE_ZGC
2923     return start;
2924   }
2925 
2926   Label L_load_element, L_store_element, L_do_card_marks, L_done;
2927 
2928   // Input registers (after setup_arg_regs)
2929   const Register from        = rdi;   // source array address
2930   const Register to          = rsi;   // destination array address
2931   const Register length      = rdx;   // elements count
2932   const Register ckoff       = rcx;   // super_check_offset
2933   const Register ckval       = r8;    // super_klass
2934 
2935   // Registers used as temps (r13, r14 are save-on-entry)
2936   const Register end_from    = from;  // source array end address
2937   const Register end_to      = r13;   // destination array end address
2938   const Register count       = rdx;   // -(count_remaining)
2939   const Register r14_length  = r14;   // saved copy of length
2940   // End pointers are inclusive, and if length is not zero they point
2941   // to the last unit copied:  end_to[0] := end_from[0]
2942 
2943   const Register rax_oop    = rax;    // actual oop copied
2944   const Register r11_klass  = r11;    // oop._klass
2945 
2946   //---------------------------------------------------------------
2947   // Assembler stub will be used for this call to arraycopy
2948   // if the two arrays are subtypes of Object[] but the
2949   // destination array type is not equal to or a supertype
2950   // of the source type.  Each element must be separately
2951   // checked.
2952 
2953   __ align(CodeEntryAlignment);
2954   StubCodeMark mark(this, stub_id);
2955   start = __ pc();
2956 
2957   __ enter(); // required for proper stackwalking of RuntimeStub frame
2958 
2959 #ifdef ASSERT
2960   // caller guarantees that the arrays really are different
2961   // otherwise, we would have to make conjoint checks
2962   { Label L;
2963     array_overlap_test(L, TIMES_OOP);
2964     __ stop("checkcast_copy within a single array");
2965     __ bind(L);
2966   }
2967 #endif //ASSERT
2968 
2969   setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
2970                                   // ckoff => rcx, ckval => r8
2971                                   // r9 is used to save r15_thread
2972 #ifdef _WIN64
2973   // last argument (#4) is on stack on Win64
2974   __ movptr(ckval, Address(rsp, 6 * wordSize));
2975 #endif
2976 
2977   // Caller of this entry point must set up the argument registers.
2978   if (entry != nullptr) {
2979     *entry = __ pc();
2980     entries.append(*entry);
2981     BLOCK_COMMENT("Entry:");
2982   }
2983 
2984   // allocate spill slots for r13, r14 and r10
2985   enum {
2986     saved_r13_offset,
2987     saved_r14_offset,
2988     saved_r10_offset,
2989     saved_rbp_offset
2990   };
2991   __ subptr(rsp, saved_rbp_offset * wordSize);
2992   __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2993   __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2994   __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2995 
2996 #ifdef ASSERT
2997     Label L2;
2998     __ get_thread_slow(r14);
2999     __ cmpptr(r15_thread, r14);
3000     __ jcc(Assembler::equal, L2);
3001     __ stop("generate_checkcast_copy: r15_thread is modified by call");
3002     __ bind(L2);
3003 #endif // ASSERT
3004 
3005   // check that int operands are properly extended to size_t
3006   assert_clean_int(length, rax);
3007   assert_clean_int(ckoff, rax);
3008 
3009 #ifdef ASSERT
3010   BLOCK_COMMENT("assert consistent ckoff/ckval");
3011   // The ckoff and ckval must be mutually consistent,
3012   // even though caller generates both.
3013   { Label L;
3014     int sco_offset = in_bytes(Klass::super_check_offset_offset());
3015     __ cmpl(ckoff, Address(ckval, sco_offset));
3016     __ jcc(Assembler::equal, L);
3017     __ stop("super_check_offset inconsistent");
3018     __ bind(L);
3019   }
3020 #endif //ASSERT
3021 
3022   // Loop-invariant addresses.  They are exclusive end pointers.
3023   Address end_from_addr(from, length, TIMES_OOP, 0);
3024   Address   end_to_addr(to,   length, TIMES_OOP, 0);
3025   // Loop-variant addresses.  They assume post-incremented count < 0.
3026   Address from_element_addr(end_from, count, TIMES_OOP, 0);
3027   Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
3028 
3029   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
3030   if (dest_uninitialized) {
3031     decorators |= IS_DEST_UNINITIALIZED;
3032   }
3033 
3034   BasicType type = T_OBJECT;
3035   size_t element_size = UseCompressedOops ? 4 : 8;
3036 
3037   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3038   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
3039 
3040   // Copy from low to high addresses, indexed from the end of each array.
3041   __ lea(end_from, end_from_addr);
3042   __ lea(end_to,   end_to_addr);
3043   __ movptr(r14_length, length);        // save a copy of the length
3044   assert(length == count, "");          // else fix next line:
3045   __ negptr(count);                     // negate and test the length
3046   __ jcc(Assembler::notZero, L_load_element);
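  // Worked example (illustration only): with length == N the leas above
  // leave end_from/end_to just past the last element and count == -N, so
  // from_element_addr == end_from + esize*count walks elements 0 .. N-1
  // as count increments from -N toward zero.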
3047 
3048   // Empty array:  Nothing to do.
3049   __ xorptr(rax, rax);                  // return 0 on (trivial) success
3050   __ jmp(L_done);
3051 
3052   // ======== begin loop ========
3053   // (Loop is rotated; its entry is L_load_element.)
3054   // Loop control:
3055   //   for (count = -count; count != 0; count++)
3056   // Base pointers src, dst are biased by 8*(count-1), to last element.
3057   __ align(OptoLoopAlignment);
3058 
3059   __ BIND(L_store_element);
3060   bs->copy_store_at(_masm,
3061                     decorators,
3062                     type,
3063                     element_size,
3064                     to_element_addr,
3065                     rax_oop,
3066                     r10);
3067   __ increment(count);               // increment the count toward zero
3068   __ jcc(Assembler::zero, L_do_card_marks);
3069 
3070   // ======== loop entry is here ========
3071   __ BIND(L_load_element);
3072   bs->copy_load_at(_masm,
3073                    decorators,
3074                    type,
3075                    element_size,
3076                    rax_oop,
3077                    from_element_addr,
3078                    r10);
3079   __ testptr(rax_oop, rax_oop);
3080   __ jcc(Assembler::zero, L_store_element);
3081 
3082   __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
3083   generate_type_check(r11_klass, ckoff, ckval, L_store_element);
3084   // ======== end loop ========
3085 
3086   // It was a real error; we must depend on the caller to finish the job.
3087   // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
3088   // Emit GC store barriers for the oops we have copied (r14 + rdx),
3089   // and report their number to the caller.
3090   assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
3091   Label L_post_barrier;
3092   __ addptr(r14_length, count);     // K = (original - remaining) oops
3093   __ movptr(rax, r14_length);       // save the value
3094   __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
3095   __ jccb(Assembler::notZero, L_post_barrier);
3096   __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
3097 
3098   // Come here on success only.
3099   __ BIND(L_do_card_marks);
3100   __ xorptr(rax, rax);              // return 0 on success
3101 
3102   __ BIND(L_post_barrier);
3103   bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
3104 
3105   // Common exit point (success or failure).
3106   __ BIND(L_done);
3107   __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
3108   __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
3109   __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
3110   restore_arg_regs_using_thread();
3111   INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
3112   __ leave(); // required for proper stackwalking of RuntimeStub frame
3113   __ ret(0);
3114 
3115   address end = __ pc();
3116 #if INCLUDE_ZGC
3117   // retrieve addresses at which ZGC does colour patching
3118   if (UseZGC) {
3119     retrieve_reloc_addresses(start, end, extras);
3120   }
3121 #endif // INCLUDE_ZGC
3122   // record the stub entry and end plus the no_push entry
3123   store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
3124 
3125   return start;
3126 }
3127 
3128 
3129 //  Generate 'unsafe' array copy stub
3130 //  Though just as safe as the other stubs, it takes an unscaled
3131 //  size_t argument instead of an element count.
3132 //
3133 //  Input:
3134 //    c_rarg0   - source array address
3135 //    c_rarg1   - destination array address
3136 //    c_rarg2   - byte count, treated as ssize_t, can be zero
3137 //
3138 // Examines the alignment of the operands and dispatches
3139 // to a long, int, short, or byte copy loop.
3140 //
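// For example, with both addresses 8-byte aligned, a 30-byte request has
// (from | to | size) & 7 == 6 and & 3 == 2 but & 1 == 0, so it falls
// through to L_short_aligned and jumps to the short copy stub with
// count == 30 >> 1 == 15.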
3141 address StubGenerator::generate_unsafe_copy(address byte_copy_entry, address short_copy_entry,
3142                                             address int_copy_entry, address long_copy_entry) {
3143 
3144   StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
3145   int entry_count = StubInfo::entry_count(stub_id);
3146   assert(entry_count == 1, "sanity check");
3147   address start = load_archive_data(stub_id);
3148   if (start != nullptr) {
3149     return start;
3150   }
3151 
3152   Label L_long_aligned, L_int_aligned, L_short_aligned;
3153 
3154   // Input registers (before setup_arg_regs)
3155   const Register from        = c_rarg0;  // source array address
3156   const Register to          = c_rarg1;  // destination array address
3157   const Register size        = c_rarg2;  // byte count (size_t)
3158 
3159   // Register used as a temp
3160   const Register bits        = rax;      // test copy of low bits
3161 
3162   __ align(CodeEntryAlignment);
3163   StubCodeMark mark(this, stub_id);
3164   start = __ pc();
3165 
3166   __ enter(); // required for proper stackwalking of RuntimeStub frame
3167 
3168   // bump this on entry, not on exit:
3169   INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
3170 
3171   __ mov(bits, from);
3172   __ orptr(bits, to);
3173   __ orptr(bits, size);
3174 
3175   __ testb(bits, BytesPerLong-1);
3176   __ jccb(Assembler::zero, L_long_aligned);
3177 
3178   __ testb(bits, BytesPerInt-1);
3179   __ jccb(Assembler::zero, L_int_aligned);
3180 
3181   __ testb(bits, BytesPerShort-1);
3182   __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
3183 
3184   __ BIND(L_short_aligned);
3185   __ shrptr(size, LogBytesPerShort); // size => short_count
3186   __ jump(RuntimeAddress(short_copy_entry));
3187 
3188   __ BIND(L_int_aligned);
3189   __ shrptr(size, LogBytesPerInt); // size => int_count
3190   __ jump(RuntimeAddress(int_copy_entry));
3191 
3192   __ BIND(L_long_aligned);
3193   __ shrptr(size, LogBytesPerLong); // size => qword_count
3194   __ jump(RuntimeAddress(long_copy_entry));
3195 
3196   // record the stub entry and end
3197   store_archive_data(stub_id, start, __ pc());
3198 
3199   return start;
3200 }
3201 
3202 
3203 // Static enum for helper
3204 enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
3205 // Helper for generate_unsafe_setmemory
3206 //
3207 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
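// Each store emitted below is a single aligned 2-, 4- or 8-byte mov,
// which the x86 memory model keeps atomic, so no element is ever split
// across stores. Worked example for USM_DWORD with size == 44: size >> 2
// leaves 11 dwords, tmp == 11 >> 3 == 1 unrolled pass (8 stores), then
// size & 7 == 3 tail stores.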
3208 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
3209                                      Register size, Register wide_value,
3210                                      Register tmp, Label& L_exit,
3211                                      MacroAssembler *_masm) {
3212   Label L_Loop, L_Tail, L_TailLoop;
3213 
3214   int shiftval = 0;
3215   int incr = 0;
3216 
3217   switch (type) {
3218     case USM_SHORT:
3219       shiftval = 1;
3220       incr = 16;
3221       break;
3222     case USM_DWORD:
3223       shiftval = 2;
3224       incr = 32;
3225       break;
3226     case USM_QUADWORD:
3227       shiftval = 3;
3228       incr = 64;
3229       break;
3230   }
3231 
3232   // At this point, we know the lower bits of size are zero
3233   __ shrq(size, shiftval);
3234   // size now has number of X-byte chunks (2, 4 or 8)
3235 
3236   // Number of (8*X)-byte chunks into tmp
3237   __ movq(tmp, size);
3238   __ shrq(tmp, 3);
3239   __ jccb(Assembler::zero, L_Tail);
3240 
3241   __ BIND(L_Loop);
3242 
3243   // Unroll 8 stores
3244   for (int i = 0; i < 8; i++) {
3245     switch (type) {
3246       case USM_SHORT:
3247         __ movw(Address(dest, (2 * i)), wide_value);
3248         break;
3249       case USM_DWORD:
3250         __ movl(Address(dest, (4 * i)), wide_value);
3251         break;
3252       case USM_QUADWORD:
3253         __ movq(Address(dest, (8 * i)), wide_value);
3254         break;
3255     }
3256   }
3257   __ addq(dest, incr);
3258   __ decrementq(tmp);
3259   __ jccb(Assembler::notZero, L_Loop);
3260 
3261   __ BIND(L_Tail);
3262 
3263   // Find number of remaining X-byte chunks
3264   __ andq(size, 0x7);
3265 
3266   // If zero, then we're done
3267   __ jccb(Assembler::zero, L_exit);
3268 
3269   __ BIND(L_TailLoop);
3270 
3271     switch (type) {
3272       case USM_SHORT:
3273         __ movw(Address(dest, 0), wide_value);
3274         break;
3275       case USM_DWORD:
3276         __ movl(Address(dest, 0), wide_value);
3277         break;
3278       case USM_QUADWORD:
3279         __ movq(Address(dest, 0), wide_value);
3280         break;
3281     }
3282   __ addq(dest, incr >> 3);
3283   __ decrementq(size);
3284   __ jccb(Assembler::notZero, L_TailLoop);
3285 }
3286 
3287 //  Generate 'unsafe' set memory stub
3288 //  Though just as safe as the other stubs, it takes an unscaled
3289 //  size_t (# bytes) argument instead of an element count.
3290 //
3291 //  Input:
3292 //    c_rarg0   - destination array address
3293 //    c_rarg1   - byte count (size_t)
3294 //    c_rarg2   - byte value
3295 //
3296 // Examines the alignment of the operands and dispatches
3297 // to a quadword, dword, short, or byte fill loop.
3298 //
3299 address StubGenerator::generate_unsafe_setmemory(address unsafe_byte_fill) {
3300   StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
3301   int entry_count = StubInfo::entry_count(stub_id);
3302   assert(entry_count == 1, "sanity check");
3303   // we expect three sets of extra UnsafeMemoryAccess handler entries
3304   GrowableArray<address> extras;
3305   int expected_handler_count = 3 * UnsafeMemoryAccess::COLUMN_COUNT;
3306   address start = load_archive_data(stub_id, nullptr, &extras);
3307   if (start != nullptr) {
3308     assert(extras.length() == expected_handler_count,
3309            "unexpected handler addresses count %d", extras.length());
3310     register_unsafe_access_handlers(extras, 0, 3);
3311     return start;
3312   }
3313 
3314   __ align(CodeEntryAlignment);
3315   StubCodeMark mark(this, stub_id);
3316   start = __ pc();
3317   __ enter();   // required for proper stackwalking of RuntimeStub frame
3318 
3319   assert(unsafe_byte_fill != nullptr, "Invalid call");
3320 
3321   // bump this on entry, not on exit:
3322   INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);
3323 
3324   {
3325     Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;
3326 
3327     const Register dest = c_rarg0;
3328     const Register size = c_rarg1;
3329     const Register byteVal = c_rarg2;
3330     const Register wide_value = rax;
3331     const Register rScratch1 = r10;
3332 
3333     assert_different_registers(dest, size, byteVal, wide_value, rScratch1);
3334 
3335     //     fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
3336 
3337     __ testq(size, size);
3338     __ jcc(Assembler::zero, L_exit);
3339 
3340     // Propagate byte to full Register
3341     __ movzbl(rScratch1, byteVal);
3342     __ mov64(wide_value, 0x0101010101010101ULL);
3343     __ imulq(wide_value, rScratch1);
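    // e.g. byteVal == 0xAB: 0xAB * 0x0101010101010101 == 0xABABABABABABABAB,
    // replicating the byte into every lane of wide_value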
3344 
3345     // Check for pointer & size alignment
3346     __ movq(rScratch1, dest);
3347     __ orq(rScratch1, size);
3348 
3349     __ testb(rScratch1, 7);
3350     __ jcc(Assembler::equal, L_fillQuadwords);
3351 
3352     __ testb(rScratch1, 3);
3353     __ jcc(Assembler::equal, L_fillDwords);
3354 
3355     __ testb(rScratch1, 1);
3356     __ jcc(Assembler::notEqual, L_fillBytes);
3357 
3358     // Fill words
3359     {
3360       UnsafeMemoryAccessMark umam(this, true, true);
3361 
3362       // At this point, the low bit of (dest | size) is clear, so both
3363       // dest and size are multiples of 2
3364       do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
3365                                L_exit, _masm);
3366     }
3367     __ jmpb(L_exit);
3368 
3369     __ BIND(L_fillQuadwords);
3370 
3371     // Fill QUADWORDs
3372     {
3373       UnsafeMemoryAccessMark umam(this, true, true);
3374 
3375       // At this point, the low 3 bits of (dest | size) are clear, so both
3376       // dest and size are multiples of 8
3377       do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
3378                                L_exit, _masm);
3379     }
3380     __ BIND(L_exit);
3381 
3382     __ leave();   // required for proper stackwalking of RuntimeStub frame
3383     __ ret(0);
3384 
3385     __ BIND(L_fillDwords);
3386 
3387     // Fill DWORDs
3388     {
3389       UnsafeMemoryAccessMark umam(this, true, true);
3390 
3391       // At this point, the low 2 bits of (dest | size) are clear, so both
3392       // dest and size are multiples of 4
3393       do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
3394                                L_exit, _masm);
3395     }
3396     __ jmpb(L_exit);
3397 
3398     __ BIND(L_fillBytes);
3399     // Set up for tail call to previously generated byte fill routine
3400     // Parameter order is (ptr, byteVal, size)
3401     __ xchgq(c_rarg1, c_rarg2);
3402     __ leave();    // Clear effect of enter()
3403     __ jump(RuntimeAddress(unsafe_byte_fill));
3404   }
3405 
3406   // retrieve the registered handler addresses
3407   address end = __ pc();
3408   retrieve_unsafe_access_handlers(start, end, extras);
3409   assert(extras.length() == expected_handler_count,
3410          "unexpected handler addresses count %d", extras.length());
3411 
3412   // record the stub entry and end plus the no_push entry and any
3413   // extra handler addresses
3414   store_archive_data(stub_id, start, end, nullptr, &extras);
3415 
3416   return start;
3417 }
3418 
3419 // Perform range checks on the proposed arraycopy.
3420 // Kills temp, but nothing else.
3421 // Also, clean the sign bits of src_pos and dst_pos.
3422 void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
3423                                            Register src_pos, // source position (c_rarg1)
3424                                            Register dst,     // destination array oop (c_rarg2)
3425                                            Register dst_pos, // destination position (c_rarg3)
3426                                            Register length,
3427                                            Register temp,
3428                                            Label& L_failed) {
3429   BLOCK_COMMENT("arraycopy_range_checks:");
3430 
3431   //  if (src_pos + length > arrayOop(src)->length())  FAIL;
3432   __ movl(temp, length);
3433   __ addl(temp, src_pos);             // src_pos + length
3434   __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
3435   __ jcc(Assembler::above, L_failed);
3436 
3437   //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
3438   __ movl(temp, length);
3439   __ addl(temp, dst_pos);             // dst_pos + length
3440   __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
3441   __ jcc(Assembler::above, L_failed);
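  // Both compares are unsigned ('above'). The callers have already
  // rejected negative positions and lengths, so the 32-bit adds cannot
  // wrap, and an unsigned compare correctly rejects sums of 2^31 or more
  // that a signed compare would misread as negative.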
3442 
3443   // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
3444   // Move with sign extension can be used since they are positive.
3445   __ movslq(src_pos, src_pos);
3446   __ movslq(dst_pos, dst_pos);
3447 
3448   BLOCK_COMMENT("arraycopy_range_checks done");
3449 }
3450 
3451 
3452 //  Generate generic array copy stubs
3453 //
3454 //  Input:
3455 //    c_rarg0    -  src oop
3456 //    c_rarg1    -  src_pos (32-bits)
3457 //    c_rarg2    -  dst oop
3458 //    c_rarg3    -  dst_pos (32-bits)
3459 // not Win64
3460 //    c_rarg4    -  element count (32-bits)
3461 // Win64
3462 //    rsp+40     -  element count (32-bits)
3463 //
3464 //  Output:
3465 //    rax ==  0  -  success
3466 //    rax == -1^K - failure, where K is partial transfer count
3467 //
3468 address StubGenerator::generate_generic_copy(address byte_copy_entry, address short_copy_entry,
3469                                              address int_copy_entry, address oop_copy_entry,
3470                                              address long_copy_entry, address checkcast_copy_entry) {
3471 
3472   StubId stub_id = StubId::stubgen_generic_arraycopy_id;
3473   int entry_count = StubInfo::entry_count(stub_id);
3474   assert(entry_count == 1, "sanity check");
3475   address start = load_archive_data(stub_id);
3476   if (start != nullptr) {
3477     return start;
3478   }
3479 
3480   Label L_failed, L_failed_0, L_skip_failed_0, L_objArray;
3481   Label L_copy_shorts, L_copy_ints, L_copy_longs;
3482 
3483   // Input registers
3484   const Register src        = c_rarg0;  // source array oop
3485   const Register src_pos    = c_rarg1;  // source position
3486   const Register dst        = c_rarg2;  // destination array oop
3487   const Register dst_pos    = c_rarg3;  // destination position
3488 #ifndef _WIN64
3489   const Register length     = c_rarg4;
3490   const Register rklass_tmp = r9;  // load_klass
3491 #else
3492   const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
3493   const Register rklass_tmp = rdi;  // load_klass
3494 #endif
3495 
3496   StubCodeMark mark(this, stub_id);
3497   __ align(CodeEntryAlignment);
3498   start = __ pc();
3499 
3500   __ enter(); // required for proper stackwalking of RuntimeStub frame
3501 
3502 #ifdef _WIN64
3503   __ push_ppx(rklass_tmp); // rdi is callee-save on Windows
3504 #endif
3505 
3506   // bump this on entry, not on exit:
3507   INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);
3508 
3509   //-----------------------------------------------------------------------
3510   // Assembler stub will be used for this call to arraycopy
3511   // if the following conditions are met:
3512   //
3513   // (1) src and dst must not be null.
3514   // (2) src_pos must not be negative.
3515   // (3) dst_pos must not be negative.
3516   // (4) length  must not be negative.
3517   // (5) src klass and dst klass should be the same and not null.
3518   // (6) src and dst should be arrays.
3519   // (7) src_pos + length must not exceed length of src.
3520   // (8) dst_pos + length must not exceed length of dst.
3521   //
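  // For example, copying from an int[] into a long[] fails check (5)
  // (different klasses), so the stub returns -1 and leaves the copy to
  // the caller's slower runtime path.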
3522 
3523   //  if (src == nullptr) return -1;
3524   __ testptr(src, src);         // src oop
3525   size_t j1off = __ offset();
3526   __ jccb(Assembler::zero, L_failed_0);
3527 
3528   //  if (src_pos < 0) return -1;
3529   __ testl(src_pos, src_pos); // src_pos (32-bits)
3530   __ jccb(Assembler::negative, L_failed_0);
3531 
3532   //  if (dst == nullptr) return -1;
3533   __ testptr(dst, dst);         // dst oop
3534   __ jccb(Assembler::zero, L_failed_0);
3535 
3536   //  if (dst_pos < 0) return -1;
3537   __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3538   size_t j4off = __ offset();
3539   // skip over the failure trampoline
3540   __ jccb(Assembler::positive, L_skip_failed_0);
3541 
3542   // The first four tests are very dense code,
3543   // but not quite dense enough to put four
3544   // jumps in a 16-byte instruction fetch buffer.
3545   // That's good, because some branch predictors
3546   // do not like jumps so close together.
3547   // Make sure of this.
3548   guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
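  // (j1off ^ j4off) & ~15 is nonzero exactly when the two jccb's begin in
  // different 16-byte fetch blocks, which is the property enforced here.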
3549 
3550   // Short-hop target to L_failed.  Makes for denser prologue code.
3551   __ BIND(L_failed_0);
3552   __ jmp(L_failed);
3553 
3554   // continue here if first 4 checks pass
3555   __ bind(L_skip_failed_0);
3556 
3557   // registers used as temp
3558   const Register r11_length    = r11; // elements count to copy
3559   const Register r10_src_klass = r10; // array klass
3560 
3561   //  if (length < 0) return -1;
3562   __ movl(r11_length, length);        // length (elements count, 32-bits value)
3563   __ testl(r11_length, r11_length);
3564   __ jccb(Assembler::negative, L_failed_0);
3565 
3566   __ load_klass(r10_src_klass, src, rklass_tmp);
3567 #ifdef ASSERT
3568   //  assert(src->klass() != nullptr);
3569   {
3570     BLOCK_COMMENT("assert klasses not null {");
3571     Label L1, L2;
3572     __ testptr(r10_src_klass, r10_src_klass);
3573     __ jcc(Assembler::notZero, L2);   // it is broken if klass is null
3574     __ bind(L1);
3575     __ stop("broken null klass");
3576     __ bind(L2);
3577     __ load_klass(rax, dst, rklass_tmp);
3578     __ cmpq(rax, 0);
3579     __ jcc(Assembler::equal, L1);     // this would be broken also
3580     BLOCK_COMMENT("} assert klasses not null done");
3581   }
3582 #endif
3583 
3584   // Load layout helper (32-bits)
3585   //
3586   //  |array_tag|     | header_size | element_type |     |log2_element_size|
3587   // 32        30    24            16              8     2                 0
3588   //
3589   //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3590   //
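  //   e.g. an int[] has array_tag == 0x3 (typeArray), element_type == T_INT
  //   and log2_element_size == 2; all objArrays share the single layout
  //   helper value (objArray_lh below), so one cmpl routes them to L_objArray.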
3591 
3592   const int lh_offset = in_bytes(Klass::layout_helper_offset());
3593 
3594   // Handle objArrays completely differently...
3595   const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3596   __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3597   __ jcc(Assembler::equal, L_objArray);
3598 
3599   //  if (src->klass() != dst->klass()) return -1;
3600   __ load_klass(rax, dst, rklass_tmp);
3601   __ cmpq(r10_src_klass, rax);
3602   __ jcc(Assembler::notEqual, L_failed);
3603 
3604   // Check for flat inline type array -> return -1
3605   __ test_flat_array_oop(src, rax, L_failed);
3606 
3607   // Check for null-free (non-flat) inline type array -> handle as object array
3608   __ test_null_free_array_oop(src, rax, L_objArray);
3609 
3610   const Register rax_lh = rax;  // layout helper
3611   __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3612 
3613   // Check for flat inline type array -> return -1
3614   __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace);
3615   __ jcc(Assembler::notZero, L_failed);
3616 
3617   //  if (!src->is_Array()) return -1;
3618   __ cmpl(rax_lh, Klass::_lh_neutral_value);
3619   __ jcc(Assembler::greaterEqual, L_failed);
3620 
3621   // At this point, it is known to be a typeArray (array_tag 0x3).
3622 #ifdef ASSERT
3623   {
3624     BLOCK_COMMENT("assert primitive array {");
3625     Label L;
3626     __ movl(rklass_tmp, rax_lh);
3627     __ sarl(rklass_tmp, Klass::_lh_array_tag_shift);
3628     __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value);
3629     __ jcc(Assembler::equal, L);
3630     __ stop("must be a primitive array");
3631     __ bind(L);
3632     BLOCK_COMMENT("} assert primitive array done");
3633   }
3634 #endif
3635 
3636   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3637                          r10, L_failed);
3638 
3639   // TypeArrayKlass
3640   //
3641   // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3642   // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3643   //
3644 
3645   const Register r10_offset = r10;    // array offset
3646   const Register rax_elsize = rax_lh; // element size
3647 
3648   __ movl(r10_offset, rax_lh);
3649   __ shrl(r10_offset, Klass::_lh_header_size_shift);
3650   __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3651   __ addptr(src, r10_offset);           // src array offset
3652   __ addptr(dst, r10_offset);           // dst array offset
3653   BLOCK_COMMENT("choose copy loop based on element size");
3654   __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3655 
3656 #ifdef _WIN64
3657   __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3658 #endif
3659 
3660   // next registers should be set before the jump to corresponding stub
3661   const Register from     = c_rarg0;  // source array address
3662   const Register to       = c_rarg1;  // destination array address
3663   const Register count    = c_rarg2;  // elements count
3664 
3665   // 'from', 'to' and 'count' must be set in this order since they alias
3666   // 'src', 'src_pos' and 'dst'; each register is written only after its last use.
3667 
3668   __ cmpl(rax_elsize, 0);
3669   __ jccb(Assembler::notEqual, L_copy_shorts);
3670   __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3671   __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3672   __ movl2ptr(count, r11_length); // length
3673   __ jump(RuntimeAddress(byte_copy_entry));
3674 
3675 __ BIND(L_copy_shorts);
3676   __ cmpl(rax_elsize, LogBytesPerShort);
3677   __ jccb(Assembler::notEqual, L_copy_ints);
3678   __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3679   __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3680   __ movl2ptr(count, r11_length); // length
3681   __ jump(RuntimeAddress(short_copy_entry));
3682 
3683 __ BIND(L_copy_ints);
3684   __ cmpl(rax_elsize, LogBytesPerInt);
3685   __ jccb(Assembler::notEqual, L_copy_longs);
3686   __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3687   __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3688   __ movl2ptr(count, r11_length); // length
3689   __ jump(RuntimeAddress(int_copy_entry));
3690 
3691 __ BIND(L_copy_longs);
3692 #ifdef ASSERT
3693   {
3694     BLOCK_COMMENT("assert long copy {");
3695     Label L;
3696     __ cmpl(rax_elsize, LogBytesPerLong);
3697     __ jcc(Assembler::equal, L);
3698     __ stop("must be long copy, but elsize is wrong");
3699     __ bind(L);
3700     BLOCK_COMMENT("} assert long copy done");
3701   }
3702 #endif
3703   __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3704   __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3705   __ movl2ptr(count, r11_length); // length
3706   __ jump(RuntimeAddress(long_copy_entry));
3707 
3708   // ObjArrayKlass
3709 __ BIND(L_objArray);
3710   // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3711 
3712   Label L_plain_copy, L_checkcast_copy;
3713   //  test array classes for subtyping
3714   __ load_klass(rax, dst, rklass_tmp);
3715   __ cmpq(r10_src_klass, rax); // usual case is exact equality
3716   __ jcc(Assembler::notEqual, L_checkcast_copy);
3717 
3718   // Identically typed arrays can be copied without element-wise checks.
3719   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3720                          r10, L_failed);
3721 
3722   __ lea(from, Address(src, src_pos, TIMES_OOP,
3723                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3724   __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3725                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3726   __ movl2ptr(count, r11_length); // length
3727 __ BIND(L_plain_copy);
3728 #ifdef _WIN64
3729   __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3730 #endif
3731   __ jump(RuntimeAddress(oop_copy_entry));
3732 
3733 __ BIND(L_checkcast_copy);
3734   // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3735   {
3736     // Before looking at dst.length, make sure dst is also an objArray.
3737     // This check also fails for flat arrays which are not supported.
3738     __ cmpl(Address(rax, lh_offset), objArray_lh);
3739     __ jcc(Assembler::notEqual, L_failed);
3740 
3741 #ifdef ASSERT
3742     {
3743       BLOCK_COMMENT("assert not null-free array {");
3744       Label L;
3745       __ test_non_null_free_array_oop(dst, rklass_tmp, L);
3746       __ stop("unexpected null-free array");
3747       __ bind(L);
3748       BLOCK_COMMENT("} assert not null-free array");
3749     }
3750 #endif
3751 
3752     // It is safe to examine both src.length and dst.length.
3753     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3754                            rax, L_failed);
3755 
3756     const Register r11_dst_klass = r11;
3757     __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3758 
3759     // Marshal the base address arguments now, freeing registers.
3760     __ lea(from, Address(src, src_pos, TIMES_OOP,
3761                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3762     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3763                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3764     __ movl(count, length);           // length (reloaded)
3765     Register sco_temp = c_rarg3;      // this register is free now
3766     assert_different_registers(from, to, count, sco_temp,
3767                                r11_dst_klass, r10_src_klass);
3768     assert_clean_int(count, sco_temp);
3769 
3770     // Generate the type check.
3771     const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3772     __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3773     assert_clean_int(sco_temp, rax);
3774     generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3775 
3776     // Fetch destination element klass from the ObjArrayKlass header.
3777     int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3778     __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3779     __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3780     assert_clean_int(sco_temp, rax);
3781 
3782 #ifdef _WIN64
3783     __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3784 #endif
3785 
3786     // the checkcast_copy loop needs two extra arguments:
3787     assert(c_rarg3 == sco_temp, "#3 already in place");
3788     // Set up arguments for checkcast_copy_entry.
3789     setup_arg_regs_using_thread(4);
3790     __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on non-Win64
3791     __ jump(RuntimeAddress(checkcast_copy_entry));
3792   }
3793 
3794 __ BIND(L_failed);
3795 #ifdef _WIN64
3796   __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3797 #endif
3798   __ xorptr(rax, rax);
3799   __ notptr(rax); // return -1
3800   __ leave();   // required for proper stackwalking of RuntimeStub frame
3801   __ ret(0);
3802 
3803   // record the stub entry and end
3804   store_archive_data(stub_id, start, __ pc());
3805 
3806   return start;
3807 }
3808 
3809 #undef __