1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/macroAssembler.hpp"
  26 #include "gc/shared/barrierSet.hpp"
  27 #include "gc/shared/barrierSetAssembler.hpp"
  28 #include "oops/objArrayKlass.hpp"
  29 #include "runtime/sharedRuntime.hpp"
  30 #include "runtime/stubRoutines.hpp"
  31 #include "stubGenerator_x86_64.hpp"
  32 #ifdef COMPILER2
  33 #include "opto/c2_globals.hpp"
  34 #endif
  35 #if INCLUDE_JVMCI
  36 #include "jvmci/jvmci_globals.hpp"
  37 #endif
  38 
  39 #define __ _masm->
  40 
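
// Address scale used when indexing oop array elements: 4 bytes per element
// with compressed oops, 8 bytes otherwise.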
  41 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #else
  46 #define BLOCK_COMMENT(str) __ block_comment(str)
  47 #endif // PRODUCT
  48 
  49 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  50 
  51 #ifdef PRODUCT
  52 #define INC_COUNTER_NP(counter, rscratch) ((void)0)
  53 #else
  54 #define INC_COUNTER_NP(counter, rscratch) \
  55 BLOCK_COMMENT("inc_counter " #counter); \
  56 inc_counter_np(_masm, counter, rscratch);
  57 
  58 static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  59   __ incrementl(ExternalAddress((address)&counter), rscratch);
  60 }
  61 
  62 #if COMPILER2_OR_JVMCI
  63 static uint& get_profile_ctr(int shift) {
  64   if (shift == 0) {
  65     return SharedRuntime::_jbyte_array_copy_ctr;
  66   } else if (shift == 1) {
  67     return SharedRuntime::_jshort_array_copy_ctr;
  68   } else if (shift == 2) {
  69     return SharedRuntime::_jint_array_copy_ctr;
  70   } else {
  71     assert(shift == 3, "");
  72     return SharedRuntime::_jlong_array_copy_ctr;
  73   }
  74 }
  75 #endif // COMPILER2_OR_JVMCI
  76 #endif // !PRODUCT
  77 
  78 void StubGenerator::generate_arraycopy_stubs() {
  79   address entry;
  80   address entry_jbyte_arraycopy;
  81   address entry_jshort_arraycopy;
  82   address entry_jint_arraycopy;
  83   address entry_oop_arraycopy;
  84   address entry_jlong_arraycopy;
  85   address entry_checkcast_arraycopy;
  86 
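  // Each conjoint stub receives the no-overlap entry of the matching disjoint
  // stub; its overlap test branches there when the source and destination
  // ranges do not overlap.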
  87   StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(&entry);
  88   StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(entry, &entry_jbyte_arraycopy);
  89 
  90   StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&entry);
  91   StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(entry, &entry_jshort_arraycopy);
  92 
  93   StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
  94   StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
  95 
  96   StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(StubGenStubId::jlong_disjoint_arraycopy_id, &entry);
  97   StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(StubGenStubId::jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
  98   if (UseCompressedOops) {
  99     StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(StubGenStubId::oop_disjoint_arraycopy_id, &entry);
 100     StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(StubGenStubId::oop_arraycopy_id, entry, &entry_oop_arraycopy);
 101     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(StubGenStubId::oop_disjoint_arraycopy_uninit_id, &entry);
 102     StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(StubGenStubId::oop_arraycopy_uninit_id, entry, nullptr);
 103   } else {
 104     StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(StubGenStubId::oop_disjoint_arraycopy_id, &entry);
 105     StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(StubGenStubId::oop_arraycopy_id, entry, &entry_oop_arraycopy);
 106     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(StubGenStubId::oop_disjoint_arraycopy_uninit_id, &entry);
 107     StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(StubGenStubId::oop_arraycopy_uninit_id, entry, nullptr);
 108   }
 109 
 110   StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 111   StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 112 
 113   StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 114                                                             entry_jshort_arraycopy,
 115                                                             entry_jint_arraycopy,
 116                                                             entry_jlong_arraycopy);
 117   StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 118                                                              entry_jshort_arraycopy,
 119                                                              entry_jint_arraycopy,
 120                                                              entry_oop_arraycopy,
 121                                                              entry_jlong_arraycopy,
 122                                                              entry_checkcast_arraycopy);
 123 
 124   StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 125   StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 126   StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 127   StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 128   StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 129   StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 130 
 131   StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
 132 
 133   // We don't generate specialized code for HeapWord-aligned source
 134   // arrays, so just use the code we've already generated
 135   StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
 136   StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
 137 
 138   StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
 139   StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
 140 
 141   StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
 142   StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
 143 
 144   StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
 145   StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
 146 
 147   StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
 148   StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
 149 
 150   StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
 151   StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
 152 }
 153 
 154 
// Verify that a register contains a clean 32-bit positive value
// (high 32 bits are 0) so it can be used in 64-bit shifts.
 157 //
 158 //  Input:
//    Rint  -  32-bit value
 160 //    Rtmp  -  scratch
 161 //
 162 void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
 163 #ifdef ASSERT
 164   Label L;
 165   assert_different_registers(Rtmp, Rint);
 166   __ movslq(Rtmp, Rint);
 167   __ cmpq(Rtmp, Rint);
 168   __ jcc(Assembler::equal, L);
 169   __ stop("high 32-bits of int value are not 0");
 170   __ bind(L);
 171 #endif
 172 }
 173 
 174 
 175 //  Generate overlap test for array copy stubs
 176 //
 177 //  Input:
 178 //     c_rarg0 - from
 179 //     c_rarg1 - to
 180 //     c_rarg2 - element count
 181 //
 182 //  Output:
//     rax   - &from[element count]
 184 //
 185 void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
 186   const Register from     = c_rarg0;
 187   const Register to       = c_rarg1;
 188   const Register count    = c_rarg2;
 189   const Register end_from = rax;
 190 
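  // The forward (no-overlap) copy path is safe when 'to' is at or below 'from',
  // or when 'to' begins at or beyond the end of the source range.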
 191   __ cmpptr(to, from);
 192   __ lea(end_from, Address(from, count, sf, 0));
 193   if (NOLp == nullptr) {
 194     RuntimeAddress no_overlap(no_overlap_target);
 195     __ jump_cc(Assembler::belowEqual, no_overlap);
 196     __ cmpptr(to, end_from);
 197     __ jump_cc(Assembler::aboveEqual, no_overlap);
 198   } else {
 199     __ jcc(Assembler::belowEqual, (*NOLp));
 200     __ cmpptr(to, end_from);
 201     __ jcc(Assembler::aboveEqual, (*NOLp));
 202   }
 203 }
 204 
 205 
 206 // Copy big chunks forward
 207 //
 208 // Inputs:
//   end_from     - source array end address
//   end_to       - destination array end address
//   qword_count  - 64-bit element count, negative
//   tmp1         - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes - exit label
 215 //
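// Note: qword_count is negative and end-relative; each access is addressed as
// end_ptr + qword_count * 8, and the loop advances qword_count toward zero.
//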
 216 void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
 217                                        Register qword_count, Register tmp1,
 218                                        Register tmp2, Label& L_copy_bytes,
 219                                        Label& L_copy_8_bytes, DecoratorSet decorators,
 220                                        BasicType type) {
 221   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 222   DEBUG_ONLY(__ stop("enter at entry label, not here"));
 223   Label L_loop;
 224   __ align(OptoLoopAlignment);
 225   if (UseUnalignedLoadStores) {
 226     Label L_end;
 227     __ BIND(L_loop);
 228     if (UseAVX >= 2) {
 229       bs->copy_load_at(_masm, decorators, type, 32,
 230                        xmm0, Address(end_from, qword_count, Address::times_8, -56),
 231                        tmp1, xmm1);
 232       bs->copy_store_at(_masm, decorators, type, 32,
 233                         Address(end_to, qword_count, Address::times_8, -56), xmm0,
 234                         tmp1, tmp2, xmm1);
 235 
 236       bs->copy_load_at(_masm, decorators, type, 32,
 237                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 238                        tmp1, xmm1);
 239       bs->copy_store_at(_masm, decorators, type, 32,
 240                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 241                         tmp1, tmp2, xmm1);
 242     } else {
 243       bs->copy_load_at(_masm, decorators, type, 16,
 244                        xmm0, Address(end_from, qword_count, Address::times_8, -56),
 245                        tmp1, xmm1);
 246       bs->copy_store_at(_masm, decorators, type, 16,
 247                         Address(end_to, qword_count, Address::times_8, -56), xmm0,
 248                         tmp1, tmp2, xmm1);
 249       bs->copy_load_at(_masm, decorators, type, 16,
 250                        xmm0, Address(end_from, qword_count, Address::times_8, -40),
 251                        tmp1, xmm1);
 252       bs->copy_store_at(_masm, decorators, type, 16,
 253                         Address(end_to, qword_count, Address::times_8, -40), xmm0,
 254                         tmp1, tmp2, xmm1);
 255       bs->copy_load_at(_masm, decorators, type, 16,
 256                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 257                        tmp1, xmm1);
 258       bs->copy_store_at(_masm, decorators, type, 16,
 259                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 260                         tmp1, tmp2, xmm1);
 261       bs->copy_load_at(_masm, decorators, type, 16,
 262                        xmm0, Address(end_from, qword_count, Address::times_8, -8),
 263                        tmp1, xmm1);
 264       bs->copy_store_at(_masm, decorators, type, 16,
 265                         Address(end_to, qword_count, Address::times_8, -8), xmm0,
 266                         tmp1, tmp2, xmm1);
 267     }
 268 
 269     __ BIND(L_copy_bytes);
 270     __ addptr(qword_count, 8);
 271     __ jcc(Assembler::lessEqual, L_loop);
 272     __ subptr(qword_count, 4);  // sub(8) and add(4)
 273     __ jcc(Assembler::greater, L_end);
 274     // Copy trailing 32 bytes
 275     if (UseAVX >= 2) {
 276       bs->copy_load_at(_masm, decorators, type, 32,
 277                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 278                        tmp1, xmm1);
 279       bs->copy_store_at(_masm, decorators, type, 32,
 280                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 281                         tmp1, tmp2, xmm1);
 282     } else {
 283       bs->copy_load_at(_masm, decorators, type, 16,
 284                        xmm0, Address(end_from, qword_count, Address::times_8, -24),
 285                        tmp1, xmm1);
 286       bs->copy_store_at(_masm, decorators, type, 16,
 287                         Address(end_to, qword_count, Address::times_8, -24), xmm0,
 288                         tmp1, tmp2, xmm1);
 289       bs->copy_load_at(_masm, decorators, type, 16,
 290                        xmm0, Address(end_from, qword_count, Address::times_8, -8),
 291                        tmp1, xmm1);
 292       bs->copy_store_at(_masm, decorators, type, 16,
 293                         Address(end_to, qword_count, Address::times_8, -8), xmm0,
 294                         tmp1, tmp2, xmm1);
 295     }
 296     __ addptr(qword_count, 4);
 297     __ BIND(L_end);
 298   } else {
    // Copy 32 bytes per iteration
 300     __ BIND(L_loop);
 301     bs->copy_load_at(_masm, decorators, type, 8,
 302                      tmp1, Address(end_from, qword_count, Address::times_8, -24),
 303                      tmp2);
 304     bs->copy_store_at(_masm, decorators, type, 8,
 305                       Address(end_to, qword_count, Address::times_8, -24), tmp1,
 306                       tmp2);
 307     bs->copy_load_at(_masm, decorators, type, 8,
 308                      tmp1, Address(end_from, qword_count, Address::times_8, -16),
 309                      tmp2);
 310     bs->copy_store_at(_masm, decorators, type, 8,
 311                       Address(end_to, qword_count, Address::times_8, -16), tmp1,
 312                       tmp2);
 313     bs->copy_load_at(_masm, decorators, type, 8,
 314                      tmp1, Address(end_from, qword_count, Address::times_8, -8),
 315                      tmp2);
 316     bs->copy_store_at(_masm, decorators, type, 8,
 317                       Address(end_to, qword_count, Address::times_8, -8), tmp1,
 318                       tmp2);
 319     bs->copy_load_at(_masm, decorators, type, 8,
 320                      tmp1, Address(end_from, qword_count, Address::times_8, 0),
 321                      tmp2);
 322     bs->copy_store_at(_masm, decorators, type, 8,
 323                       Address(end_to, qword_count, Address::times_8, 0), tmp1,
 324                       tmp2);
 325 
 326     __ BIND(L_copy_bytes);
 327     __ addptr(qword_count, 4);
 328     __ jcc(Assembler::lessEqual, L_loop);
 329   }
 330   __ subptr(qword_count, 4);
 331   __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
 332 }
 333 
 334 
 335 // Copy big chunks backward
 336 //
 337 // Inputs:
//   from         - source array address
//   dest         - destination array address
//   qword_count  - 64-bit element count
//   tmp1         - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes - exit label
 344 //
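// Note: qword_count is positive and counts down toward zero; each access is
// addressed as base + qword_count * 8, so the copy proceeds from high to low.
//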
 345 void StubGenerator::copy_bytes_backward(Register from, Register dest,
 346                                         Register qword_count, Register tmp1,
 347                                         Register tmp2, Label& L_copy_bytes,
 348                                         Label& L_copy_8_bytes, DecoratorSet decorators,
 349                                         BasicType type) {
 350   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 351   DEBUG_ONLY(__ stop("enter at entry label, not here"));
 352   Label L_loop;
 353   __ align(OptoLoopAlignment);
 354   if (UseUnalignedLoadStores) {
 355     Label L_end;
 356     __ BIND(L_loop);
 357     if (UseAVX >= 2) {
 358       bs->copy_load_at(_masm, decorators, type, 32,
 359                        xmm0, Address(from, qword_count, Address::times_8, 32),
 360                        tmp1, xmm1);
 361       bs->copy_store_at(_masm, decorators, type, 32,
 362                         Address(dest, qword_count, Address::times_8, 32), xmm0,
 363                         tmp1, tmp2, xmm1);
 364       bs->copy_load_at(_masm, decorators, type, 32,
 365                        xmm0, Address(from, qword_count, Address::times_8, 0),
 366                        tmp1, xmm1);
 367       bs->copy_store_at(_masm, decorators, type, 32,
 368                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 369                         tmp1, tmp2, xmm1);
 370     } else {
 371       bs->copy_load_at(_masm, decorators, type, 16,
 372                        xmm0, Address(from, qword_count, Address::times_8, 48),
 373                        tmp1, xmm1);
 374       bs->copy_store_at(_masm, decorators, type, 16,
 375                         Address(dest, qword_count, Address::times_8, 48), xmm0,
 376                         tmp1, tmp2, xmm1);
 377       bs->copy_load_at(_masm, decorators, type, 16,
 378                        xmm0, Address(from, qword_count, Address::times_8, 32),
 379                        tmp1, xmm1);
 380       bs->copy_store_at(_masm, decorators, type, 16,
 381                         Address(dest, qword_count, Address::times_8, 32), xmm0,
 382                         tmp1, tmp2, xmm1);
 383       bs->copy_load_at(_masm, decorators, type, 16,
 384                        xmm0, Address(from, qword_count, Address::times_8, 16),
 385                        tmp1, xmm1);
 386       bs->copy_store_at(_masm, decorators, type, 16,
 387                         Address(dest, qword_count, Address::times_8, 16), xmm0,
 388                         tmp1, tmp2, xmm1);
 389       bs->copy_load_at(_masm, decorators, type, 16,
 390                        xmm0, Address(from, qword_count, Address::times_8, 0),
 391                        tmp1, xmm1);
 392       bs->copy_store_at(_masm, decorators, type, 16,
 393                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 394                         tmp1, tmp2, xmm1);
 395     }
 396 
 397     __ BIND(L_copy_bytes);
 398     __ subptr(qword_count, 8);
 399     __ jcc(Assembler::greaterEqual, L_loop);
 400 
 401     __ addptr(qword_count, 4);  // add(8) and sub(4)
 402     __ jcc(Assembler::less, L_end);
 403     // Copy trailing 32 bytes
 404     if (UseAVX >= 2) {
 405       bs->copy_load_at(_masm, decorators, type, 32,
 406                        xmm0, Address(from, qword_count, Address::times_8, 0),
 407                        tmp1, xmm1);
 408       bs->copy_store_at(_masm, decorators, type, 32,
 409                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 410                         tmp1, tmp2, xmm1);
 411     } else {
 412       bs->copy_load_at(_masm, decorators, type, 16,
 413                        xmm0, Address(from, qword_count, Address::times_8, 16),
 414                        tmp1, xmm1);
 415       bs->copy_store_at(_masm, decorators, type, 16,
 416                         Address(dest, qword_count, Address::times_8, 16), xmm0,
 417                         tmp1, tmp2, xmm1);
 418       bs->copy_load_at(_masm, decorators, type, 16,
 419                        xmm0, Address(from, qword_count, Address::times_8, 0),
 420                        tmp1, xmm1);
 421       bs->copy_store_at(_masm, decorators, type, 16,
 422                         Address(dest, qword_count, Address::times_8, 0), xmm0,
 423                         tmp1, tmp2, xmm1);
 424     }
 425     __ subptr(qword_count, 4);
 426     __ BIND(L_end);
 427   } else {
    // Copy 32 bytes per iteration
 429     __ BIND(L_loop);
 430     bs->copy_load_at(_masm, decorators, type, 8,
 431                      tmp1, Address(from, qword_count, Address::times_8, 24),
 432                      tmp2);
 433     bs->copy_store_at(_masm, decorators, type, 8,
 434                       Address(dest, qword_count, Address::times_8, 24), tmp1,
 435                       tmp2);
 436     bs->copy_load_at(_masm, decorators, type, 8,
 437                      tmp1, Address(from, qword_count, Address::times_8, 16),
 438                      tmp2);
 439     bs->copy_store_at(_masm, decorators, type, 8,
 440                       Address(dest, qword_count, Address::times_8, 16), tmp1,
 441                       tmp2);
 442     bs->copy_load_at(_masm, decorators, type, 8,
 443                      tmp1, Address(from, qword_count, Address::times_8, 8),
 444                      tmp2);
 445     bs->copy_store_at(_masm, decorators, type, 8,
 446                       Address(dest, qword_count, Address::times_8, 8), tmp1,
 447                       tmp2);
 448     bs->copy_load_at(_masm, decorators, type, 8,
 449                      tmp1, Address(from, qword_count, Address::times_8, 0),
 450                      tmp2);
 451     bs->copy_store_at(_masm, decorators, type, 8,
 452                       Address(dest, qword_count, Address::times_8, 0), tmp1,
 453                       tmp2);
 454 
 455     __ BIND(L_copy_bytes);
 456     __ subptr(qword_count, 4);
 457     __ jcc(Assembler::greaterEqual, L_loop);
 458   }
 459   __ addptr(qword_count, 4);
 460   __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
 461 }
 462 
 463 #if COMPILER2_OR_JVMCI
 464 
// Note: The following rules apply to the AVX3-optimized arraycopy stubs:
// - If the target supports the AVX3 features (BW+VL+F), the implementation uses 32-byte
//   vectors (YMMs) for both the special cases (various small block sizes) and the aligned
//   copy loop. This is the default configuration.
// - If the copy length is above AVX3Threshold, the implementation uses 64-byte vectors (ZMMs)
//   for the main copy loop (and the subsequent tail), since the bulk of the cycles is spent there.
// - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS has been observed to
//   perform better for disjoint copies; for conjoint/backward copies the vector-based copy
//   performs better.
// - If the user sets AVX3Threshold=0, the special cases for small block sizes operate over
//   64-byte vector registers (ZMMs).
 476 
 477 // Inputs:
 478 //   c_rarg0   - source array address
 479 //   c_rarg1   - destination array address
 480 //   c_rarg2   - element count, treated as ssize_t, can be zero
 481 //
 482 //
// Side Effects:
//   *entry is set to the no-overlap entry point used by
//   generate_conjoint_[byte/int/short/long]_copy().
 486 //
 487 address StubGenerator::generate_disjoint_copy_avx3_masked(StubGenStubId stub_id, address* entry) {
 488   // aligned is always false -- x86_64 always uses the unaligned code
 489   const bool aligned = false;
 490   int shift;
 491   bool is_oop;
 492   bool dest_uninitialized;
 493 
 494   switch (stub_id) {
 495   case jbyte_disjoint_arraycopy_id:
 496     shift = 0;
 497     is_oop = false;
 498     dest_uninitialized = false;
 499     break;
 500   case jshort_disjoint_arraycopy_id:
 501     shift = 1;
 502     is_oop = false;
 503     dest_uninitialized = false;
 504     break;
 505   case jint_disjoint_arraycopy_id:
 506     shift = 2;
 507     is_oop = false;
 508     dest_uninitialized = false;
 509     break;
 510   case jlong_disjoint_arraycopy_id:
 511     shift = 3;
 512     is_oop = false;
 513     dest_uninitialized = false;
 514     break;
 515   case oop_disjoint_arraycopy_id:
 516     shift = (UseCompressedOops ? 2 : 3);
 517     is_oop = true;
 518     dest_uninitialized = false;
 519     break;
 520   case oop_disjoint_arraycopy_uninit_id:
 521     shift = (UseCompressedOops ? 2 : 3);
 522     is_oop = true;
 523     dest_uninitialized = true;
 524     break;
 525   default:
 526     ShouldNotReachHere();
 527   }
 528 
 529   __ align(CodeEntryAlignment);
 530   StubCodeMark mark(this, stub_id);
 531   address start = __ pc();
 532 
 533   int avx3threshold = VM_Version::avx3_threshold();
 534   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
 535   const int large_threshold = 2621440; // 2.5 MB
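  // Above large_threshold (and with 64-byte vectors available) the copy is
  // handed off to arraycopy_avx3_large(), which uses non-temporal stores.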
 536   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 537   Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 538   Label L_copy_large, L_finish;
 539   const Register from        = rdi;  // source array address
 540   const Register to          = rsi;  // destination array address
 541   const Register count       = rdx;  // elements count
 542   const Register temp1       = r8;
 543   const Register temp2       = r11;
 544   const Register temp3       = rax;
 545   const Register temp4       = rcx;
 546   // End pointers are inclusive, and if count is not zero they point
 547   // to the last unit copied:  end_to[0] := end_from[0]
 548 
 549   __ enter(); // required for proper stackwalking of RuntimeStub frame
 550   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 551 
 552   if (entry != nullptr) {
 553     *entry = __ pc();
 554      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 555     BLOCK_COMMENT("Entry:");
 556   }
 557 
 558   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 559   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 560 
 561   setup_argument_regs(type);
 562 
 563   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 564   if (dest_uninitialized) {
 565     decorators |= IS_DEST_UNINITIALIZED;
 566   }
 567   if (aligned) {
 568     decorators |= ARRAYCOPY_ALIGNED;
 569   }
 570   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 571   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 572 
 573   {
 574     // Type(shift)           byte(0), short(1), int(2),   long(3)
 575     int loop_size[]        = { 192,     96,       48,      24};
 576     int threshold[]        = { 4096,    2048,     1024,    512};
 577 
 578     // UnsafeMemoryAccess page error: continue after unsafe access
 579     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
 580     // 'from', 'to' and 'count' are now valid
 581 
    // temp1 holds the remaining element count and temp4 holds the running count
    // used to compute the next address offset into to/from (temp4 * scale).
 584     __ mov64(temp4, 0);
 585     __ movq(temp1, count);
 586 
 587     // Zero length check.
 588     __ BIND(L_tail);
 589     __ cmpq(temp1, 0);
 590     __ jcc(Assembler::lessEqual, L_exit);
 591 
 592     // Special cases using 32 byte [masked] vector copy operations.
 593     arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 594                                  temp4, temp3, use64byteVector, L_entry, L_exit);
 595 
 596     // PRE-MAIN-POST loop for aligned copy.
 597     __ BIND(L_entry);
 598 
 599     if (MaxVectorSize == 64) {
 600       __ movq(temp2, temp1);
 601       __ shlq(temp2, shift);
 602       __ cmpq(temp2, large_threshold);
 603       __ jcc(Assembler::greaterEqual, L_copy_large);
 604     }
 605     if (avx3threshold != 0) {
 606       __ cmpq(count, threshold[shift]);
 607       if (MaxVectorSize == 64) {
 608         // Copy using 64 byte vectors.
 609         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 610       } else {
 611         assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offers a faster copy path.
 613         __ jcc(Assembler::greaterEqual, L_repmovs);
 614       }
 615     }
 616 
 617     if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
 618       // Partial copy to make dst address 32 byte aligned.
 619       __ movq(temp2, to);
 620       __ andq(temp2, 31);
 621       __ jcc(Assembler::equal, L_main_pre_loop);
 622 
 623       __ negptr(temp2);
 624       __ addq(temp2, 32);
 625       if (shift) {
 626         __ shrq(temp2, shift);
 627       }
 628       __ movq(temp3, temp2);
 629       copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
 630       __ movq(temp4, temp2);
 631       __ movq(temp1, count);
 632       __ subq(temp1, temp2);
 633 
 634       __ cmpq(temp1, loop_size[shift]);
 635       __ jcc(Assembler::less, L_tail);
 636 
 637       __ BIND(L_main_pre_loop);
 638       __ subq(temp1, loop_size[shift]);
 639 
 640       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 641       __ align32();
 642       __ BIND(L_main_loop);
 643          copy64_avx(to, from, temp4, xmm1, false, shift, 0);
 644          copy64_avx(to, from, temp4, xmm1, false, shift, 64);
 645          copy64_avx(to, from, temp4, xmm1, false, shift, 128);
 646          __ addptr(temp4, loop_size[shift]);
 647          __ subq(temp1, loop_size[shift]);
 648          __ jcc(Assembler::greater, L_main_loop);
 649 
 650       __ addq(temp1, loop_size[shift]);
 651 
 652       // Tail loop.
 653       __ jmp(L_tail);
 654 
 655       __ BIND(L_repmovs);
 656         __ movq(temp2, temp1);
        // Swap to (RSI) and from (RDI) addresses to comply with REP MOVS semantics.
 658         __ movq(temp3, to);
 659         __ movq(to,  from);
 660         __ movq(from, temp3);
 661         // Save to/from for restoration post rep_mov.
 662         __ movq(temp1, to);
 663         __ movq(temp3, from);
        if (shift < 3) {
          __ shrq(temp2, 3 - shift);   // quad word count
        }
        __ movq(temp4, temp2);         // move quad word count into temp4 (RCX).
 668         __ rep_mov();
 669         __ shlq(temp2, 3);             // convert quad words into byte count.
        if (shift) {
          __ shrq(temp2, shift);       // type-specific count.
 672         }
 673         // Restore original addresses in to/from.
 674         __ movq(to, temp3);
 675         __ movq(from, temp1);
 676         __ movq(temp4, temp2);
 677         __ movq(temp1, count);
        __ subq(temp1, temp2);         // trailing part (less than a quad word).
 679         __ jmp(L_tail);
 680     }
 681 
 682     if (MaxVectorSize > 32) {
 683       __ BIND(L_pre_main_post_64);
 684       // Partial copy to make dst address 64 byte aligned.
 685       __ movq(temp2, to);
 686       __ andq(temp2, 63);
 687       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 688 
 689       __ negptr(temp2);
 690       __ addq(temp2, 64);
 691       if (shift) {
 692         __ shrq(temp2, shift);
 693       }
 694       __ movq(temp3, temp2);
 695       copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
 696       __ movq(temp4, temp2);
 697       __ movq(temp1, count);
 698       __ subq(temp1, temp2);
 699 
 700       __ cmpq(temp1, loop_size[shift]);
 701       __ jcc(Assembler::less, L_tail64);
 702 
 703       __ BIND(L_main_pre_loop_64bytes);
 704       __ subq(temp1, loop_size[shift]);
 705 
 706       // Main loop with aligned copy block size of 192 bytes at
 707       // 64 byte copy granularity.
 708       __ align32();
 709       __ BIND(L_main_loop_64bytes);
 710          copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
 711          copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
 712          copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
 713          __ addptr(temp4, loop_size[shift]);
 714          __ subq(temp1, loop_size[shift]);
 715          __ jcc(Assembler::greater, L_main_loop_64bytes);
 716 
 717       __ addq(temp1, loop_size[shift]);
 718       // Zero length check.
 719       __ jcc(Assembler::lessEqual, L_exit);
 720 
 721       __ BIND(L_tail64);
 722 
 723       // Tail handling using 64 byte [masked] vector copy operations.
 724       use64byteVector = true;
 725       arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
 726                                    temp4, temp3, use64byteVector, L_entry, L_exit);
 727     }
 728     __ BIND(L_exit);
 729   }
 730 
 731   __ BIND(L_finish);
 732   address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 is expected to hold specific values
  // used during the arraycopy epilogue, so re-initialize r11 here.
 735   if (is_oop) {
 736     __ movq(r11, shift == 3 ? count : to);
 737   }
 738   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
 739   restore_argument_regs(type);
 740   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
 741   __ xorptr(rax, rax); // return 0
 742   __ vzeroupper();
 743   __ leave(); // required for proper stackwalking of RuntimeStub frame
 744   __ ret(0);
 745 
 746   if (MaxVectorSize == 64) {
 747     __ BIND(L_copy_large);
 748       UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
 749       arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
 750     __ jmp(L_finish);
 751   }
 752   return start;
 753 }
 754 
 755 void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
 756                                          Register temp3, Register temp4, Register count,
 757                                          XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
 758                                          XMMRegister xmm4, int shift) {
 759 
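  // The large-copy path works in 256-byte blocks and uses non-temporal stores
  // (see copy256_avx3), avoiding cache pollution for very large copies; the
  // sfence after the main loop orders those stores.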
 760   // Type(shift)           byte(0), short(1), int(2),   long(3)
 761   int loop_size[]        = { 256,     128,       64,      32};
 762   int threshold[]        = { 4096,    2048,     1024,    512};
 763 
 764   Label L_main_loop_large;
 765   Label L_tail_large;
 766   Label L_exit_large;
 767   Label L_entry_large;
 768   Label L_main_pre_loop_large;
 769   Label L_pre_main_post_large;
 770 
 771   assert(MaxVectorSize == 64, "vector length != 64");
 772   __ BIND(L_entry_large);
 773 
 774   __ BIND(L_pre_main_post_large);
 775   // Partial copy to make dst address 64 byte aligned.
 776   __ movq(temp2, to);
 777   __ andq(temp2, 63);
 778   __ jcc(Assembler::equal, L_main_pre_loop_large);
 779 
 780   __ negptr(temp2);
 781   __ addq(temp2, 64);
 782   if (shift) {
 783     __ shrq(temp2, shift);
 784   }
 785   __ movq(temp3, temp2);
 786   copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
 787   __ movq(temp4, temp2);
 788   __ movq(temp1, count);
 789   __ subq(temp1, temp2);
 790 
 791   __ cmpq(temp1, loop_size[shift]);
 792   __ jcc(Assembler::less, L_tail_large);
 793 
 794   __ BIND(L_main_pre_loop_large);
 795   __ subq(temp1, loop_size[shift]);
 796 
 797   // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
 798   __ align32();
 799   __ BIND(L_main_loop_large);
 800   copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
 801   __ addptr(temp4, loop_size[shift]);
 802   __ subq(temp1, loop_size[shift]);
 803   __ jcc(Assembler::greater, L_main_loop_large);
 804   // fence needed because copy256_avx3 uses non-temporal stores
 805   __ sfence();
 806 
 807   __ addq(temp1, loop_size[shift]);
 808   // Zero length check.
 809   __ jcc(Assembler::lessEqual, L_exit_large);
 810   __ BIND(L_tail_large);
 811   // Tail handling using 64 byte [masked] vector copy operations.
 812   __ cmpq(temp1, 0);
 813   __ jcc(Assembler::lessEqual, L_exit_large);
 814   arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
 815                                temp4, temp3, L_exit_large);
 816   __ BIND(L_exit_large);
 817 }
 818 
 819 // Inputs:
 820 //   c_rarg0   - source array address
 821 //   c_rarg1   - destination array address
 822 //   c_rarg2   - element count, treated as ssize_t, can be zero
 823 //
 824 //
 825 address StubGenerator::generate_conjoint_copy_avx3_masked(StubGenStubId stub_id, address* entry, address nooverlap_target) {
 826   // aligned is always false -- x86_64 always uses the unaligned code
 827   const bool aligned = false;
 828   int shift;
 829   bool is_oop;
 830   bool dest_uninitialized;
 831 
 832   switch (stub_id) {
 833   case jbyte_arraycopy_id:
 834     shift = 0;
 835     is_oop = false;
 836     dest_uninitialized = false;
 837     break;
 838   case jshort_arraycopy_id:
 839     shift = 1;
 840     is_oop = false;
 841     dest_uninitialized = false;
 842     break;
 843   case jint_arraycopy_id:
 844     shift = 2;
 845     is_oop = false;
 846     dest_uninitialized = false;
 847     break;
 848   case jlong_arraycopy_id:
 849     shift = 3;
 850     is_oop = false;
 851     dest_uninitialized = false;
 852     break;
 853   case oop_arraycopy_id:
 854     shift = (UseCompressedOops ? 2 : 3);
 855     is_oop = true;
 856     dest_uninitialized = false;
 857     break;
 858   case oop_arraycopy_uninit_id:
 859     shift = (UseCompressedOops ? 2 : 3);
 860     is_oop = true;
 861     dest_uninitialized = true;
 862     break;
 863   default:
 864     ShouldNotReachHere();
 865   }
 866 
 867   __ align(CodeEntryAlignment);
 868   StubCodeMark mark(this, stub_id);
 869   address start = __ pc();
 870 
 871   int avx3threshold = VM_Version::avx3_threshold();
 872   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
 873 
 874   Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 875   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 876   const Register from        = rdi;  // source array address
 877   const Register to          = rsi;  // destination array address
 878   const Register count       = rdx;  // elements count
 879   const Register temp1       = r8;
 880   const Register temp2       = rcx;
 881   const Register temp3       = r11;
 882   const Register temp4       = rax;
 883   // End pointers are inclusive, and if count is not zero they point
 884   // to the last unit copied:  end_to[0] := end_from[0]
 885 
 886   __ enter(); // required for proper stackwalking of RuntimeStub frame
 887   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 888 
 889   if (entry != nullptr) {
 890     *entry = __ pc();
 891      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 892     BLOCK_COMMENT("Entry:");
 893   }
 894 
 895   array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
 896 
 897   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 898   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 899 
 900   setup_argument_regs(type);
 901 
 902   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 903   if (dest_uninitialized) {
 904     decorators |= IS_DEST_UNINITIALIZED;
 905   }
 906   if (aligned) {
 907     decorators |= ARRAYCOPY_ALIGNED;
 908   }
 909   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 910   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 911   {
 912     // Type(shift)       byte(0), short(1), int(2),   long(3)
 913     int loop_size[]   = { 192,     96,       48,      24};
 914     int threshold[]   = { 4096,    2048,     1024,    512};
 915 
 916     // UnsafeMemoryAccess page error: continue after unsafe access
 917     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
 918     // 'from', 'to' and 'count' are now valid
 919 
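    // The conjoint copy walks from the high end of the arrays toward the low
    // end: blocks are addressed at negative offsets from base + temp1 * scale.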
 920     // temp1 holds remaining count.
 921     __ movq(temp1, count);
 922 
 923     // Zero length check.
 924     __ BIND(L_tail);
 925     __ cmpq(temp1, 0);
 926     __ jcc(Assembler::lessEqual, L_exit);
 927 
 928     __ mov64(temp2, 0);
 929     __ movq(temp3, temp1);
 930     // Special cases using 32 byte [masked] vector copy operations.
 931     arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
 932                                           temp4, use64byteVector, L_entry, L_exit);
 933 
 934     // PRE-MAIN-POST loop for aligned copy.
 935     __ BIND(L_entry);
 936 
 937     if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
 938       __ cmpq(temp1, threshold[shift]);
 939       __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 940     }
 941 
 942     if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
 943       // Partial copy to make dst address 32 byte aligned.
 944       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
 945       __ andq(temp2, 31);
 946       __ jcc(Assembler::equal, L_main_pre_loop);
 947 
 948       if (shift) {
 949         __ shrq(temp2, shift);
 950       }
 951       __ subq(temp1, temp2);
 952       copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
 953 
 954       __ cmpq(temp1, loop_size[shift]);
 955       __ jcc(Assembler::less, L_tail);
 956 
 957       __ BIND(L_main_pre_loop);
 958 
 959       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 960       __ align32();
 961       __ BIND(L_main_loop);
 962          copy64_avx(to, from, temp1, xmm1, true, shift, -64);
 963          copy64_avx(to, from, temp1, xmm1, true, shift, -128);
 964          copy64_avx(to, from, temp1, xmm1, true, shift, -192);
 965          __ subptr(temp1, loop_size[shift]);
 966          __ cmpq(temp1, loop_size[shift]);
 967          __ jcc(Assembler::greater, L_main_loop);
 968 
 969       // Tail loop.
 970       __ jmp(L_tail);
 971     }
 972 
 973     if (MaxVectorSize > 32) {
 974       __ BIND(L_pre_main_post_64);
 975       // Partial copy to make dst address 64 byte aligned.
 976       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
 977       __ andq(temp2, 63);
 978       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 979 
 980       if (shift) {
 981         __ shrq(temp2, shift);
 982       }
 983       __ subq(temp1, temp2);
 984       copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
 985 
 986       __ cmpq(temp1, loop_size[shift]);
 987       __ jcc(Assembler::less, L_tail64);
 988 
 989       __ BIND(L_main_pre_loop_64bytes);
 990 
 991       // Main loop with aligned copy block size of 192 bytes at
 992       // 64 byte copy granularity.
 993       __ align32();
 994       __ BIND(L_main_loop_64bytes);
 995          copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
 996          copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
 997          copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
 998          __ subq(temp1, loop_size[shift]);
 999          __ cmpq(temp1, loop_size[shift]);
1000          __ jcc(Assembler::greater, L_main_loop_64bytes);
1001 
1002       // Zero length check.
1003       __ cmpq(temp1, 0);
1004       __ jcc(Assembler::lessEqual, L_exit);
1005 
1006       __ BIND(L_tail64);
1007 
1008       // Tail handling using 64 byte [masked] vector copy operations.
1009       use64byteVector = true;
1010       __ mov64(temp2, 0);
1011       __ movq(temp3, temp1);
1012       arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1013                                             temp4, use64byteVector, L_entry, L_exit);
1014     }
1015     __ BIND(L_exit);
1016   }
1017   address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 is expected to hold specific values
  // used during the arraycopy epilogue, so re-initialize r11 here.
  if (is_oop) {
1021     __ movq(r11, count);
1022   }
1023   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1024   restore_argument_regs(type);
1025   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
1026   __ xorptr(rax, rax); // return 0
1027   __ vzeroupper();
1028   __ leave(); // required for proper stackwalking of RuntimeStub frame
1029   __ ret(0);
1030 
1031   return start;
1032 }
1033 
1034 void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
1035                                                  Register to, Register count, int shift,
1036                                                  Register index, Register temp,
1037                                                  bool use64byteVector, Label& L_entry, Label& L_exit) {
1038   Label L_entry_64, L_entry_96, L_entry_128;
1039   Label L_entry_160, L_entry_192;
1040 
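  // size_mat[shift][i] is the element count that corresponds to 32*(i+1) bytes
  // for the element size selected by 'shift'.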
1041   int size_mat[][6] = {
1042   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
1043   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
1044   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
1045   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
1046   };
1047 
  // Case A) Special case for length less than or equal to 32 bytes.
1049   __ cmpq(count, size_mat[shift][0]);
1050   __ jccb(Assembler::greater, L_entry_64);
1051   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
1052   __ jmp(L_exit);
1053 
  // Case B) Special case for length less than or equal to 64 bytes.
1055   __ BIND(L_entry_64);
1056   __ cmpq(count, size_mat[shift][1]);
1057   __ jccb(Assembler::greater, L_entry_96);
1058   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
1059   __ jmp(L_exit);
1060 
  // Case C) Special case for length less than or equal to 96 bytes.
1062   __ BIND(L_entry_96);
1063   __ cmpq(count, size_mat[shift][2]);
1064   __ jccb(Assembler::greater, L_entry_128);
1065   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1066   __ subq(count, 64 >> shift);
1067   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
1068   __ jmp(L_exit);
1069 
  // Case D) Special case for length less than or equal to 128 bytes.
1071   __ BIND(L_entry_128);
1072   __ cmpq(count, size_mat[shift][3]);
1073   __ jccb(Assembler::greater, L_entry_160);
1074   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1075   copy32_avx(to, from, index, xmm, shift, 64);
1076   __ subq(count, 96 >> shift);
1077   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
1078   __ jmp(L_exit);
1079 
  // Case E) Special case for length less than or equal to 160 bytes.
1081   __ BIND(L_entry_160);
1082   __ cmpq(count, size_mat[shift][4]);
1083   __ jccb(Assembler::greater, L_entry_192);
1084   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1085   copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1086   __ subq(count, 128 >> shift);
1087   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
1088   __ jmp(L_exit);
1089 
  // Case F) Special case for length less than or equal to 192 bytes.
1091   __ BIND(L_entry_192);
1092   __ cmpq(count, size_mat[shift][5]);
1093   __ jcc(Assembler::greater, L_entry);
1094   copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1095   copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1096   copy32_avx(to, from, index, xmm, shift, 128);
1097   __ subq(count, 160 >> shift);
1098   copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
1099   __ jmp(L_exit);
1100 }
1101 
1102 void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
1103                                                      Register to, Register count, int shift, Register index,
1104                                                      Register temp, Label& L_exit) {
1105   Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
1106 
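  // size_mat[shift][i] is the element count that corresponds to 64*(i+1) bytes
  // for the element size selected by 'shift'.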
1107   int size_mat[][4] = {
1108   /* T_BYTE */ {64, 128, 192, 256},
1109   /* T_SHORT*/ {32, 64 , 96 , 128},
1110   /* T_INT  */ {16, 32 , 48 ,  64},
1111   /* T_LONG */ { 8, 16 , 24 ,  32}
1112   };
1113 
1114   assert(MaxVectorSize == 64, "vector length != 64");
1115   // Case A) Special case for length less than or equal to 64 bytes.
1116   __ BIND(L_entry_64);
1117   __ cmpq(count, size_mat[shift][0]);
1118   __ jccb(Assembler::greater, L_entry_128);
1119   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
1120   __ jmp(L_exit);
1121 
1122   // Case B) Special case for length less than or equal to 128 bytes.
1123   __ BIND(L_entry_128);
1124   __ cmpq(count, size_mat[shift][1]);
1125   __ jccb(Assembler::greater, L_entry_192);
1126   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1127   __ subq(count, 64 >> shift);
1128   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
1129   __ jmp(L_exit);
1130 
1131   // Case C) Special case for length less than or equal to 192 bytes.
1132   __ BIND(L_entry_192);
1133   __ cmpq(count, size_mat[shift][2]);
1134   __ jcc(Assembler::greater, L_entry_256);
1135   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1136   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1137   __ subq(count, 128 >> shift);
1138   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
1139   __ jmp(L_exit);
1140 
1141   // Case D) Special case for length less than or equal to 256 bytes.
1142   __ BIND(L_entry_256);
1143   copy64_avx(to, from, index, xmm, false, shift, 0, true);
1144   copy64_avx(to, from, index, xmm, false, shift, 64, true);
1145   copy64_avx(to, from, index, xmm, false, shift, 128, true);
1146   __ subq(count, 192 >> shift);
1147   copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
1148   __ jmp(L_exit);
1149 }
1150 
1151 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
1152                                                            Register to, Register start_index, Register end_index,
1153                                                            Register count, int shift, Register temp,
1154                                                            bool use64byteVector, Label& L_entry, Label& L_exit) {
1155   Label L_entry_64, L_entry_96, L_entry_128;
1156   Label L_entry_160, L_entry_192;
1157   bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
1158 
1159   int size_mat[][6] = {
1160   /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
1161   /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
1162   /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
1163   /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
1164   };
1165 
  // Case A) Special case for length less than or equal to 32 bytes.
1167   __ cmpq(count, size_mat[shift][0]);
1168   __ jccb(Assembler::greater, L_entry_64);
1169   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1170   __ jmp(L_exit);
1171 
  // Case B) Special case for length less than or equal to 64 bytes.
1173   __ BIND(L_entry_64);
1174   __ cmpq(count, size_mat[shift][1]);
1175   __ jccb(Assembler::greater, L_entry_96);
1176   if (avx3) {
1177      copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
1178   } else {
1179      copy32_avx(to, from, end_index, xmm, shift, -32);
1180      __ subq(count, 32 >> shift);
1181      copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1182   }
1183   __ jmp(L_exit);
1184 
  // Case C) Special case for length less than or equal to 96 bytes.
1186   __ BIND(L_entry_96);
1187   __ cmpq(count, size_mat[shift][2]);
1188   __ jccb(Assembler::greater, L_entry_128);
1189   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1190   __ subq(count, 64 >> shift);
1191   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1192   __ jmp(L_exit);
1193 
  // Case D) Special case for length less than or equal to 128 bytes.
1195   __ BIND(L_entry_128);
1196   __ cmpq(count, size_mat[shift][3]);
1197   __ jccb(Assembler::greater, L_entry_160);
1198   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1199   copy32_avx(to, from, end_index, xmm, shift, -96);
1200   __ subq(count, 96 >> shift);
1201   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1202   __ jmp(L_exit);
1203 
  // Case E) Special case for length less than or equal to 160 bytes.
1205   __ BIND(L_entry_160);
1206   __ cmpq(count, size_mat[shift][4]);
1207   __ jccb(Assembler::greater, L_entry_192);
1208   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1209   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1210   __ subq(count, 128 >> shift);
1211   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1212   __ jmp(L_exit);
1213 
  // Case F) Special case for length less than or equal to 192 bytes.
1215   __ BIND(L_entry_192);
1216   __ cmpq(count, size_mat[shift][5]);
1217   __ jcc(Assembler::greater, L_entry);
1218   copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1219   copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1220   copy32_avx(to, from, end_index, xmm, shift, -160);
1221   __ subq(count, 160 >> shift);
1222   copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1223   __ jmp(L_exit);
1224 }
1225 
1226 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1227                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1228                                 int shift, int offset) {
1229   if (MaxVectorSize == 64) {
1230     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
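    // Prefetch the source 512 and 1024 bytes (two and four 256-byte blocks)
    // ahead of the current copy position.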
1231     __ prefetcht0(Address(src, index, scale, offset + 0x200));
1232     __ prefetcht0(Address(src, index, scale, offset + 0x240));
1233     __ prefetcht0(Address(src, index, scale, offset + 0x280));
1234     __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1235 
1236     __ prefetcht0(Address(src, index, scale, offset + 0x400));
1237     __ prefetcht0(Address(src, index, scale, offset + 0x440));
1238     __ prefetcht0(Address(src, index, scale, offset + 0x480));
1239     __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1240 
1241     __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1242     __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1243     __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1244     __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1245 
1246     __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1247     __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1248     __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1249     __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1250   }
1251 }
1252 
1253 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1254                                        KRegister mask, Register length, Register index,
1255                                        Register temp, int shift, int offset,
1256                                        bool use64byteVector) {
1257   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1258   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1259   if (!use64byteVector) {
1260     copy32_avx(dst, src, index, xmm, shift, offset);
1261     __ subptr(length, 32 >> shift);
1262     copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1263   } else {
1264     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1265     assert(MaxVectorSize == 64, "vector length != 64");
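         // Build a k-mask with only the low 'length' bits set (all-ones trimmed by BZHI)
         // so the masked 512-bit load and store touch exactly 'length' elements.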
1266     __ mov64(temp, -1L);
1267     __ bzhiq(temp, temp, length);
1268     __ kmovql(mask, temp);
1269     __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1270     __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1271   }
1272 }
1273 
1274 
1275 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1276                                        KRegister mask, Register length, Register index,
1277                                        Register temp, int shift, int offset) {
1278   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1279   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1280   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
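       // Build a k-mask with only the low 'length' bits set so the masked 256-bit load
       // and store copy exactly 'length' elements and nothing beyond them.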
1281   __ mov64(temp, -1L);
1282   __ bzhiq(temp, temp, length);
1283   __ kmovql(mask, temp);
1284   __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1285   __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1286 }
1287 
1288 
1289 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1290                                 int shift, int offset) {
1291   assert(MaxVectorSize >= 32, "vector length should be >= 32");
1292   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1293   __ vmovdqu(xmm, Address(src, index, scale, offset));
1294   __ vmovdqu(Address(dst, index, scale, offset), xmm);
1295 }
1296 
1297 
1298 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1299                                 bool conjoint, int shift, int offset, bool use64byteVector) {
1300   assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1301   if (!use64byteVector) {
1302     if (conjoint) {
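           // Conjoint (overlapping, high-to-low) copy: move the upper 32 bytes before the
           // lower 32 so overlapping source bytes are read before they can be overwritten.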
1303       copy32_avx(dst, src, index, xmm, shift, offset+32);
1304       copy32_avx(dst, src, index, xmm, shift, offset);
1305     } else {
1306       copy32_avx(dst, src, index, xmm, shift, offset);
1307       copy32_avx(dst, src, index, xmm, shift, offset+32);
1308     }
1309   } else {
1310     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1311     __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1312     __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1313   }
1314 }
1315 
1316 #endif // COMPILER2_OR_JVMCI
1317 
1318 
1319 // Arguments:
1320 //   entry     - location for return of (post-push) entry
1321 //
1322 // Inputs:
1323 //   c_rarg0   - source array address
1324 //   c_rarg1   - destination array address
1325 //   c_rarg2   - element count, treated as ssize_t, can be zero
1326 //
1327 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1328 // we let the hardware handle it.  The one to eight bytes within words,
1329 // dwords or qwords that span cache line boundaries will still be loaded
1330 // and stored atomically.
1331 //
1332 // Side Effects:
1333 //   entry is set to the no-overlap entry point
1334 //   used by generate_conjoint_byte_copy().
1335 //
1336 address StubGenerator::generate_disjoint_byte_copy(address* entry) {
1337   StubGenStubId stub_id = StubGenStubId::jbyte_disjoint_arraycopy_id;
1338   // aligned is always false -- x86_64 always uses the unaligned code
1339   const bool aligned = false;
1340 #if COMPILER2_OR_JVMCI
1341   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1342     return generate_disjoint_copy_avx3_masked(stub_id, entry);
1343   }
1344 #endif
1345   __ align(CodeEntryAlignment);
1346   StubCodeMark mark(this, stub_id);
1347   address start = __ pc();
1348   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1349 
1350   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1351   Label L_copy_byte, L_exit;
1352   const Register from        = rdi;  // source array address
1353   const Register to          = rsi;  // destination array address
1354   const Register count       = rdx;  // elements count
1355   const Register byte_count  = rcx;
1356   const Register qword_count = count;
1357   const Register end_from    = from; // source array end address
1358   const Register end_to      = to;   // destination array end address
1359   // End pointers are inclusive, and if count is not zero they point
1360   // to the last unit copied:  end_to[0] := end_from[0]
1361 
1362   __ enter(); // required for proper stackwalking of RuntimeStub frame
1363   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1364 
1365   if (entry != nullptr) {
1366     *entry = __ pc();
1367      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1368     BLOCK_COMMENT("Entry:");
1369   }
1370 
1371   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1372                     // r9 and r10 may be used to save non-volatile registers
1373 
1374   {
1375     // UnsafeMemoryAccess page error: continue after unsafe access
1376     UnsafeMemoryAccessMark umam(this, !aligned, true);
1377     // 'from', 'to' and 'count' are now valid
1378     __ movptr(byte_count, count);
1379     __ shrptr(count, 3); // count => qword_count
1380 
1381     // Copy from low to high addresses.  Use 'to' as scratch.
1382     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1383     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1384     __ negptr(qword_count); // make the count negative
1385     __ jmp(L_copy_bytes);
1386 
1387     // Copy trailing qwords
1388   __ BIND(L_copy_8_bytes);
1389     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1390     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1391     __ increment(qword_count);
1392     __ jcc(Assembler::notZero, L_copy_8_bytes);
1393 
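         // After the qword loop end_from/end_to point at the last qword copied,
         // so the remaining tail starts at offset 8 from them.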
1394     // Check for and copy trailing dword
1395   __ BIND(L_copy_4_bytes);
1396     __ testl(byte_count, 4);
1397     __ jccb(Assembler::zero, L_copy_2_bytes);
1398     __ movl(rax, Address(end_from, 8));
1399     __ movl(Address(end_to, 8), rax);
1400 
1401     __ addptr(end_from, 4);
1402     __ addptr(end_to, 4);
1403 
1404     // Check for and copy trailing word
1405   __ BIND(L_copy_2_bytes);
1406     __ testl(byte_count, 2);
1407     __ jccb(Assembler::zero, L_copy_byte);
1408     __ movw(rax, Address(end_from, 8));
1409     __ movw(Address(end_to, 8), rax);
1410 
1411     __ addptr(end_from, 2);
1412     __ addptr(end_to, 2);
1413 
1414     // Check for and copy trailing byte
1415   __ BIND(L_copy_byte);
1416     __ testl(byte_count, 1);
1417     __ jccb(Assembler::zero, L_exit);
1418     __ movb(rax, Address(end_from, 8));
1419     __ movb(Address(end_to, 8), rax);
1420   }
1421   __ BIND(L_exit);
1422   address ucme_exit_pc = __ pc();
1423   restore_arg_regs();
1424   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1425   __ xorptr(rax, rax); // return 0
1426   __ vzeroupper();
1427   __ leave(); // required for proper stackwalking of RuntimeStub frame
1428   __ ret(0);
1429 
1430   {
1431     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1432     // Copy in multi-byte chunks
1433     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1434     __ jmp(L_copy_4_bytes);
1435   }
1436   return start;
1437 }
1438 
1439 
1440 // Arguments:
1441 //   entry     - location for return of (post-push) entry
1442 //   nooverlap_target - entry to branch to if no overlap detected
1443 //
1444 // Inputs:
1445 //   c_rarg0   - source array address
1446 //   c_rarg1   - destination array address
1447 //   c_rarg2   - element count, treated as ssize_t, can be zero
1448 //
1449 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1450 // we let the hardware handle it.  The one to eight bytes within words,
1451 // dwords or qwords that span cache line boundaries will still be loaded
1452 // and stored atomically.
1453 //
1454 address StubGenerator::generate_conjoint_byte_copy(address nooverlap_target, address* entry) {
1455   StubGenStubId stub_id = StubGenStubId::jbyte_arraycopy_id;
1456   // aligned is always false -- x86_64 always uses the unaligned code
1457   const bool aligned = false;
1458 #if COMPILER2_OR_JVMCI
1459   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1460     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
1461   }
1462 #endif
1463   __ align(CodeEntryAlignment);
1464   StubCodeMark mark(this, stub_id);
1465   address start = __ pc();
1466   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1467 
1468   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1469   const Register from        = rdi;  // source array address
1470   const Register to          = rsi;  // destination array address
1471   const Register count       = rdx;  // elements count
1472   const Register byte_count  = rcx;
1473   const Register qword_count = count;
1474 
1475   __ enter(); // required for proper stackwalking of RuntimeStub frame
1476   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1477 
1478   if (entry != nullptr) {
1479     *entry = __ pc();
1480     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1481     BLOCK_COMMENT("Entry:");
1482   }
1483 
1484   array_overlap_test(nooverlap_target, Address::times_1);
1485   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1486                     // r9 and r10 may be used to save non-volatile registers
1487 
1488   {
1489     // UnsafeMemoryAccess page error: continue after unsafe access
1490     UnsafeMemoryAccessMark umam(this, !aligned, true);
1491     // 'from', 'to' and 'count' are now valid
1492     __ movptr(byte_count, count);
1493     __ shrptr(count, 3);   // count => qword_count
1494 
1495     // Copy from high to low addresses.
1496 
1497     // Check for and copy trailing byte
1498     __ testl(byte_count, 1);
1499     __ jcc(Assembler::zero, L_copy_2_bytes);
1500     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1501     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1502     __ decrement(byte_count); // Adjust for possible trailing word
1503 
1504     // Check for and copy trailing word
1505   __ BIND(L_copy_2_bytes);
1506     __ testl(byte_count, 2);
1507     __ jcc(Assembler::zero, L_copy_4_bytes);
1508     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1509     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1510 
1511     // Check for and copy trailing dword
1512   __ BIND(L_copy_4_bytes);
1513     __ testl(byte_count, 4);
1514     __ jcc(Assembler::zero, L_copy_bytes);
1515     __ movl(rax, Address(from, qword_count, Address::times_8));
1516     __ movl(Address(to, qword_count, Address::times_8), rax);
1517     __ jmp(L_copy_bytes);
1518 
1519     // Copy trailing qwords
1520   __ BIND(L_copy_8_bytes);
1521     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1522     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1523     __ decrement(qword_count);
1524     __ jcc(Assembler::notZero, L_copy_8_bytes);
1525   }
1526   restore_arg_regs();
1527   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1528   __ xorptr(rax, rax); // return 0
1529   __ vzeroupper();
1530   __ leave(); // required for proper stackwalking of RuntimeStub frame
1531   __ ret(0);
1532 
1533   {
1534     // UnsafeMemoryAccess page error: continue after unsafe access
1535     UnsafeMemoryAccessMark umam(this, !aligned, true);
1536     // Copy in multi-byte chunks
1537     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1538   }
1539   restore_arg_regs();
1540   INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1541   __ xorptr(rax, rax); // return 0
1542   __ vzeroupper();
1543   __ leave(); // required for proper stackwalking of RuntimeStub frame
1544   __ ret(0);
1545 
1546   return start;
1547 }
1548 
1549 
1550 // Arguments:
1551 //   entry     - location for return of (post-push) entry
1552 //
1553 // Inputs:
1554 //   c_rarg0   - source array address
1555 //   c_rarg1   - destination array address
1556 //   c_rarg2   - element count, treated as ssize_t, can be zero
1557 //
1558 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1559 // let the hardware handle it.  The two or four words within dwords
1560 // or qwords that span cache line boundaries will still be loaded
1561 // and stored atomically.
1562 //
1563 // Side Effects:
1564 //   entry is set to the no-overlap entry point
1565 //   used by generate_conjoint_short_copy().
1566 //
1567 address StubGenerator::generate_disjoint_short_copy(address *entry) {
1568   StubGenStubId stub_id = StubGenStubId::jshort_disjoint_arraycopy_id;
1569   // aligned is always false -- x86_64 always uses the unaligned code
1570   const bool aligned = false;
1571 #if COMPILER2_OR_JVMCI
1572   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1573     return generate_disjoint_copy_avx3_masked(stub_id, entry);
1574   }
1575 #endif
1576 
1577   __ align(CodeEntryAlignment);
1578   StubCodeMark mark(this, stub_id);
1579   address start = __ pc();
1580   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1581 
1582   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1583   const Register from        = rdi;  // source array address
1584   const Register to          = rsi;  // destination array address
1585   const Register count       = rdx;  // elements count
1586   const Register word_count  = rcx;
1587   const Register qword_count = count;
1588   const Register end_from    = from; // source array end address
1589   const Register end_to      = to;   // destination array end address
1590   // End pointers are inclusive, and if count is not zero they point
1591   // to the last unit copied:  end_to[0] := end_from[0]
1592 
1593   __ enter(); // required for proper stackwalking of RuntimeStub frame
1594   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1595 
1596   if (entry != nullptr) {
1597     *entry = __ pc();
1598     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1599     BLOCK_COMMENT("Entry:");
1600   }
1601 
1602   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1603                     // r9 and r10 may be used to save non-volatile registers
1604 
1605   {
1606     // UnsafeMemoryAccess page error: continue after unsafe access
1607     UnsafeMemoryAccessMark umam(this, !aligned, true);
1608     // 'from', 'to' and 'count' are now valid
1609     __ movptr(word_count, count);
1610     __ shrptr(count, 2); // count => qword_count
1611 
1612     // Copy from low to high addresses.  Use 'to' as scratch.
1613     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1614     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1615     __ negptr(qword_count); // make the count negative
1616     __ jmp(L_copy_bytes);
1617 
1618     // Copy trailing qwords
1619   __ BIND(L_copy_8_bytes);
1620     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1621     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1622     __ increment(qword_count);
1623     __ jcc(Assembler::notZero, L_copy_8_bytes);
1624 
1625     // Original 'dest' is trashed, so we can't use it as a
1626     // base register for a possible trailing word copy
1627 
1628     // Check for and copy trailing dword
1629   __ BIND(L_copy_4_bytes);
1630     __ testl(word_count, 2);
1631     __ jccb(Assembler::zero, L_copy_2_bytes);
1632     __ movl(rax, Address(end_from, 8));
1633     __ movl(Address(end_to, 8), rax);
1634 
1635     __ addptr(end_from, 4);
1636     __ addptr(end_to, 4);
1637 
1638     // Check for and copy trailing word
1639   __ BIND(L_copy_2_bytes);
1640     __ testl(word_count, 1);
1641     __ jccb(Assembler::zero, L_exit);
1642     __ movw(rax, Address(end_from, 8));
1643     __ movw(Address(end_to, 8), rax);
1644   }
1645   __ BIND(L_exit);
1646   address ucme_exit_pc = __ pc();
1647   restore_arg_regs();
1648   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1649   __ xorptr(rax, rax); // return 0
1650   __ vzeroupper();
1651   __ leave(); // required for proper stackwalking of RuntimeStub frame
1652   __ ret(0);
1653 
1654   {
1655     UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1656     // Copy in multi-byte chunks
1657     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1658     __ jmp(L_copy_4_bytes);
1659   }
1660 
1661   return start;
1662 }
1663 
1664 
1665 address StubGenerator::generate_fill(StubGenStubId stub_id) {
1666   BasicType t;
1667   bool aligned;
1668 
1669   switch (stub_id) {
1670   case jbyte_fill_id:
1671     t = T_BYTE;
1672     aligned = false;
1673     break;
1674   case jshort_fill_id:
1675     t = T_SHORT;
1676     aligned = false;
1677     break;
1678   case jint_fill_id:
1679     t = T_INT;
1680     aligned = false;
1681     break;
1682   case arrayof_jbyte_fill_id:
1683     t = T_BYTE;
1684     aligned = true;
1685     break;
1686   case arrayof_jshort_fill_id:
1687     t = T_SHORT;
1688     aligned = true;
1689     break;
1690   case arrayof_jint_fill_id:
1691     t = T_INT;
1692     aligned = true;
1693     break;
1694   default:
1695     ShouldNotReachHere();
1696   }
1697 
1698   __ align(CodeEntryAlignment);
1699   StubCodeMark mark(this, stub_id);
1700   address start = __ pc();
1701 
1702   BLOCK_COMMENT("Entry:");
1703 
1704   const Register to       = c_rarg0;  // destination array address
1705   const Register value    = c_rarg1;  // value
1706   const Register count    = c_rarg2;  // elements count
1707   __ mov(r11, count);
1708 
1709   __ enter(); // required for proper stackwalking of RuntimeStub frame
1710 
1711   {
1712     // Add an UnsafeMemoryAccess mark so execution can continue if an unsafe access faults
1713     UnsafeMemoryAccessMark umam(this, ((t == T_BYTE) && !aligned), true);
1714     __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1715   }
1716 
1717   __ vzeroupper();
1718   __ leave(); // required for proper stackwalking of RuntimeStub frame
1719   __ ret(0);
1720 
1721   return start;
1722 }
1723 
1724 
1725 // Arguments:
1726 //   entry     - location for return of (post-push) entry
1727 //   nooverlap_target - entry to branch to if no overlap detected
1728 //
1729 // Inputs:
1730 //   c_rarg0   - source array address
1731 //   c_rarg1   - destination array address
1732 //   c_rarg2   - element count, treated as ssize_t, can be zero
1733 //
1734 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1735 // let the hardware handle it.  The two or four words within dwords
1736 // or qwords that span cache line boundaries will still be loaded
1737 // and stored atomically.
1738 //
1739 address StubGenerator::generate_conjoint_short_copy(address nooverlap_target, address *entry) {
1740   StubGenStubId stub_id = StubGenStubId::jshort_arraycopy_id;
1741   // aligned is always false -- x86_64 always uses the unaligned code
1742   const bool aligned = false;
1743 #if COMPILER2_OR_JVMCI
1744   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1745     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
1746   }
1747 #endif
1748 
1749   __ align(CodeEntryAlignment);
1750   StubCodeMark mark(this, stub_id);
1751   address start = __ pc();
1752   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1753 
1754   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1755   const Register from        = rdi;  // source array address
1756   const Register to          = rsi;  // destination array address
1757   const Register count       = rdx;  // elements count
1758   const Register word_count  = rcx;
1759   const Register qword_count = count;
1760 
1761   __ enter(); // required for proper stackwalking of RuntimeStub frame
1762   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1763 
1764   if (entry != nullptr) {
1765     *entry = __ pc();
1766     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1767     BLOCK_COMMENT("Entry:");
1768   }
1769 
1770   array_overlap_test(nooverlap_target, Address::times_2);
1771   setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1772                     // r9 and r10 may be used to save non-volatile registers
1773 
1774   {
1775     // UnsafeMemoryAccess page error: continue after unsafe access
1776     UnsafeMemoryAccessMark umam(this, !aligned, true);
1777     // 'from', 'to' and 'count' are now valid
1778     __ movptr(word_count, count);
1779     __ shrptr(count, 2); // count => qword_count
1780 
1781     // Copy from high to low addresses.  Use 'to' as scratch.
1782 
1783     // Check for and copy trailing word
1784     __ testl(word_count, 1);
1785     __ jccb(Assembler::zero, L_copy_4_bytes);
1786     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1787     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1788 
1789     // Check for and copy trailing dword
1790   __ BIND(L_copy_4_bytes);
1791     __ testl(word_count, 2);
1792     __ jcc(Assembler::zero, L_copy_bytes);
1793     __ movl(rax, Address(from, qword_count, Address::times_8));
1794     __ movl(Address(to, qword_count, Address::times_8), rax);
1795     __ jmp(L_copy_bytes);
1796 
1797     // Copy trailing qwords
1798   __ BIND(L_copy_8_bytes);
1799     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1800     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1801     __ decrement(qword_count);
1802     __ jcc(Assembler::notZero, L_copy_8_bytes);
1803   }
1804   restore_arg_regs();
1805   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1806   __ xorptr(rax, rax); // return 0
1807   __ vzeroupper();
1808   __ leave(); // required for proper stackwalking of RuntimeStub frame
1809   __ ret(0);
1810 
1811   {
1812     // UnsafeMemoryAccess page error: continue after unsafe access
1813     UnsafeMemoryAccessMark umam(this, !aligned, true);
1814     // Copy in multi-byte chunks
1815     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1816   }
1817   restore_arg_regs();
1818   INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1819   __ xorptr(rax, rax); // return 0
1820   __ vzeroupper();
1821   __ leave(); // required for proper stackwalking of RuntimeStub frame
1822   __ ret(0);
1823 
1824   return start;
1825 }
1826 
1827 
1828 // Arguments:
1829 //   stub_id   - unique id for stub to generate
1830 //   entry     - location for return of (post-push) entry
1831 //   is_oop    - true => oop array, so generate store check code
1832 //
1833 // Inputs:
1834 //   c_rarg0   - source array address
1835 //   c_rarg1   - destination array address
1836 //   c_rarg2   - element count, treated as ssize_t, can be zero
1837 //
1838 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1839 // the hardware handle it.  The two dwords within qwords that span
1840 // cache line boundaries will still be loaded and stored atomically.
1841 //
1842 // Side Effects:
1843 //   disjoint_int_copy_entry is set to the no-overlap entry point
1844 //   used by generate_conjoint_int_oop_copy().
1845 //
1846 address StubGenerator::generate_disjoint_int_oop_copy(StubGenStubId stub_id, address* entry) {
1847   // aligned is always false -- x86_64 always uses the unaligned code
1848   const bool aligned = false;
1849   bool is_oop;
1850   bool dest_uninitialized;
1851   switch (stub_id) {
1852   case StubGenStubId::jint_disjoint_arraycopy_id:
1853     is_oop = false;
1854     dest_uninitialized = false;
1855     break;
1856   case StubGenStubId::oop_disjoint_arraycopy_id:
1857     assert(UseCompressedOops, "inconsistent oop copy size!");
1858     is_oop = true;
1859     dest_uninitialized = false;
1860     break;
1861   case StubGenStubId::oop_disjoint_arraycopy_uninit_id:
1862     assert(UseCompressedOops, "inconsistent oop copy size!");
1863     is_oop = true;
1864     dest_uninitialized = true;
1865     break;
1866   default:
1867     ShouldNotReachHere();
1868   }
1869 
1870   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1871 #if COMPILER2_OR_JVMCI
1872   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1873     return generate_disjoint_copy_avx3_masked(stub_id, entry);
1874   }
1875 #endif
1876 
1877   __ align(CodeEntryAlignment);
1878   StubCodeMark mark(this, stub_id);
1879   address start = __ pc();
1880 
1881   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1882   const Register from        = rdi;  // source array address
1883   const Register to          = rsi;  // destination array address
1884   const Register count       = rdx;  // elements count
1885   const Register dword_count = rcx;
1886   const Register qword_count = count;
1887   const Register end_from    = from; // source array end address
1888   const Register end_to      = to;   // destination array end address
1889   // End pointers are inclusive, and if count is not zero they point
1890   // to the last unit copied:  end_to[0] := end_from[0]
1891 
1892   __ enter(); // required for proper stackwalking of RuntimeStub frame
1893   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1894 
1895   if (entry != nullptr) {
1896     *entry = __ pc();
1897     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1898     BLOCK_COMMENT("Entry:");
1899   }
1900 
1901   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1902                                  // r9 is used to save r15_thread
1903 
1904   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1905   if (dest_uninitialized) {
1906     decorators |= IS_DEST_UNINITIALIZED;
1907   }
1908   if (aligned) {
1909     decorators |= ARRAYCOPY_ALIGNED;
1910   }
1911 
1912   BasicType type = is_oop ? T_OBJECT : T_INT;
1913   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1914 
1915   {
1916     // UnsafeMemoryAccess page error: continue after unsafe access
1917     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
1918     // 'from', 'to' and 'count' are now valid
1919     __ movptr(dword_count, count);
1920     __ shrptr(count, 1); // count => qword_count
1921 
1922     // Copy from low to high addresses.  Use 'to' as scratch.
1923     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1924     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1925     __ negptr(qword_count); // make the count negative
1926     __ jmp(L_copy_bytes);
1927 
1928     // Copy trailing qwords
1929   __ BIND(L_copy_8_bytes);
1930     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1931     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1932     __ increment(qword_count);
1933     __ jcc(Assembler::notZero, L_copy_8_bytes);
1934 
1935     // Check for and copy trailing dword
1936   __ BIND(L_copy_4_bytes);
1937     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1938     __ jccb(Assembler::zero, L_exit);
1939     __ movl(rax, Address(end_from, 8));
1940     __ movl(Address(end_to, 8), rax);
1941   }
1942   __ BIND(L_exit);
1943   address ucme_exit_pc = __ pc();
1944   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1945   restore_arg_regs_using_thread();
1946   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1947   __ vzeroupper();
1948   __ xorptr(rax, rax); // return 0
1949   __ leave(); // required for proper stackwalking of RuntimeStub frame
1950   __ ret(0);
1951 
1952   {
1953     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
1954     // Copy in multi-byte chunks
1955     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
1956     __ jmp(L_copy_4_bytes);
1957   }
1958 
1959   return start;
1960 }
1961 
1962 
1963 // Arguments:
1964 //   entry     - location for return of (post-push) entry
1965 //   nooverlap_target - entry to branch to if no overlap detected
1966 //   is_oop  - true => oop array, so generate store check code
1967 //
1968 // Inputs:
1969 //   c_rarg0   - source array address
1970 //   c_rarg1   - destination array address
1971 //   c_rarg2   - element count, treated as ssize_t, can be zero
1972 //
1973 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1974 // the hardware handle it.  The two dwords within qwords that span
1975 // cache line boundaries will still be loaded and stored atomically.
1976 //
1977 address StubGenerator::generate_conjoint_int_oop_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
1978   // aligned is always false -- x86_64 always uses the unaligned code
1979   const bool aligned = false;
1980   bool is_oop;
1981   bool dest_uninitialized;
1982   switch (stub_id) {
1983   case StubGenStubId::jint_arraycopy_id:
1984     is_oop = false;
1985     dest_uninitialized = false;
1986     break;
1987   case StubGenStubId::oop_arraycopy_id:
1988     assert(UseCompressedOops, "inconsistent oop copy size!");
1989     is_oop = true;
1990     dest_uninitialized = false;
1991     break;
1992   case StubGenStubId::oop_arraycopy_uninit_id:
1993     assert(UseCompressedOops, "inconsistent oop copy size!");
1994     is_oop = true;
1995     dest_uninitialized = true;
1996     break;
1997   default:
1998     ShouldNotReachHere();
1999   }
2000 
2001   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2002 #if COMPILER2_OR_JVMCI
2003   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2004     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2005   }
2006 #endif
2007 
2008   __ align(CodeEntryAlignment);
2009   StubCodeMark mark(this, stub_id);
2010   address start = __ pc();
2011 
2012   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2013   const Register from        = rdi;  // source array address
2014   const Register to          = rsi;  // destination array address
2015   const Register count       = rdx;  // elements count
2016   const Register dword_count = rcx;
2017   const Register qword_count = count;
2018 
2019   __ enter(); // required for proper stackwalking of RuntimeStub frame
2020   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2021 
2022   if (entry != nullptr) {
2023     *entry = __ pc();
2024      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2025     BLOCK_COMMENT("Entry:");
2026   }
2027 
2028   array_overlap_test(nooverlap_target, Address::times_4);
2029   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2030                                  // r9 is used to save r15_thread
2031 
2032   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2033   if (dest_uninitialized) {
2034     decorators |= IS_DEST_UNINITIALIZED;
2035   }
2036   if (aligned) {
2037     decorators |= ARRAYCOPY_ALIGNED;
2038   }
2039 
2040   BasicType type = is_oop ? T_OBJECT : T_INT;
2041   // no registers are destroyed by this call
2042   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2043 
2044   assert_clean_int(count, rax); // Make sure 'count' is clean int.
2045   {
2046     // UnsafeMemoryAccess page error: continue after unsafe access
2047     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2048     // 'from', 'to' and 'count' are now valid
2049     __ movptr(dword_count, count);
2050     __ shrptr(count, 1); // count => qword_count
2051 
2052     // Copy from high to low addresses.  Use 'to' as scratch.
2053 
2054     // Check for and copy trailing dword
2055     __ testl(dword_count, 1);
2056     __ jcc(Assembler::zero, L_copy_bytes);
2057     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2058     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2059     __ jmp(L_copy_bytes);
2060 
2061     // Copy trailing qwords
2062   __ BIND(L_copy_8_bytes);
2063     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2064     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2065     __ decrement(qword_count);
2066     __ jcc(Assembler::notZero, L_copy_8_bytes);
2067   }
2068   if (is_oop) {
2069     __ jmp(L_exit);
2070   }
2071   restore_arg_regs_using_thread();
2072   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2073   __ xorptr(rax, rax); // return 0
2074   __ vzeroupper();
2075   __ leave(); // required for proper stackwalking of RuntimeStub frame
2076   __ ret(0);
2077 
2078   {
2079     // UnsafeMemoryAccess page error: continue after unsafe access
2080     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2081     // Copy in multi-byte chunks
2082     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2083   }
2084 
2085   __ BIND(L_exit);
2086   bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2087   restore_arg_regs_using_thread();
2088   INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2089   __ xorptr(rax, rax); // return 0
2090   __ vzeroupper();
2091   __ leave(); // required for proper stackwalking of RuntimeStub frame
2092   __ ret(0);
2093 
2094   return start;
2095 }
2096 
2097 
2098 // Arguments:
2099 //   entry     - location for return of (post-push) entry
2100 //
2101 // Inputs:
2102 //   c_rarg0   - source array address
2103 //   c_rarg1   - destination array address
2104 //   c_rarg2   - element count, treated as ssize_t, can be zero
2105 //
2106 // Side Effects:
2107 //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2108 //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2109 //
2110 address StubGenerator::generate_disjoint_long_oop_copy(StubGenStubId stub_id, address *entry) {
2111   // aligned is always false -- x86_64 always uses the unaligned code
2112   const bool aligned = false;
2113   bool is_oop;
2114   bool dest_uninitialized;
2115   switch (stub_id) {
2116   case StubGenStubId::jlong_disjoint_arraycopy_id:
2117     is_oop = false;
2118     dest_uninitialized = false;
2119     break;
2120   case StubGenStubId::oop_disjoint_arraycopy_id:
2121     assert(!UseCompressedOops, "inconsistent oop copy size!");
2122     is_oop = true;
2123     dest_uninitialized = false;
2124     break;
2125   case StubGenStubId::oop_disjoint_arraycopy_uninit_id:
2126     assert(!UseCompressedOops, "inconsistent oop copy size!");
2127     is_oop = true;
2128     dest_uninitialized = true;
2129     break;
2130   default:
2131     ShouldNotReachHere();
2132   }
2133 
2134   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2135 #if COMPILER2_OR_JVMCI
2136   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2137     return generate_disjoint_copy_avx3_masked(stub_id, entry);
2138   }
2139 #endif
2140 
2141   __ align(CodeEntryAlignment);
2142   StubCodeMark mark(this, stub_id);
2143   address start = __ pc();
2144 
2145   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2146   const Register from        = rdi;  // source array address
2147   const Register to          = rsi;  // destination array address
2148   const Register qword_count = rdx;  // elements count
2149   const Register end_from    = from; // source array end address
2150   const Register end_to      = rcx;  // destination array end address
2151   const Register saved_count = r11;
2152   // End pointers are inclusive, and if count is not zero they point
2153   // to the last unit copied:  end_to[0] := end_from[0]
2154 
2155   __ enter(); // required for proper stackwalking of RuntimeStub frame
2156   // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2157   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2158 
2159   if (entry != nullptr) {
2160     *entry = __ pc();
2161     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2162     BLOCK_COMMENT("Entry:");
2163   }
2164 
2165   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2166                                    // r9 is used to save r15_thread
2167   // 'from', 'to' and 'qword_count' are now valid
2168 
2169   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2170   if (dest_uninitialized) {
2171     decorators |= IS_DEST_UNINITIALIZED;
2172   }
2173   if (aligned) {
2174     decorators |= ARRAYCOPY_ALIGNED;
2175   }
2176 
2177   BasicType type = is_oop ? T_OBJECT : T_LONG;
2178   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2179   {
2180     // UnsafeMemoryAccess page error: continue after unsafe access
2181     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2182 
2183     // Copy from low to high addresses.  Use 'to' as scratch.
2184     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2185     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2186     __ negptr(qword_count); // make the count negative
2187     __ jmp(L_copy_bytes);
2188 
2189     // Copy trailing qwords
2190   __ BIND(L_copy_8_bytes);
2191     bs->copy_load_at(_masm, decorators, type, 8,
2192                      rax, Address(end_from, qword_count, Address::times_8, 8),
2193                      r10);
2194     bs->copy_store_at(_masm, decorators, type, 8,
2195                       Address(end_to, qword_count, Address::times_8, 8), rax,
2196                       r10);
2197     __ increment(qword_count);
2198     __ jcc(Assembler::notZero, L_copy_8_bytes);
2199   }
2200   if (is_oop) {
2201     __ jmp(L_exit);
2202   } else {
2203     restore_arg_regs_using_thread();
2204     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2205     __ xorptr(rax, rax); // return 0
2206     __ vzeroupper();
2207     __ leave(); // required for proper stackwalking of RuntimeStub frame
2208     __ ret(0);
2209   }
2210 
2211   {
2212     // UnsafeMemoryAccess page error: continue after unsafe access
2213     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2214     // Copy in multi-byte chunks
2215     copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2216   }
2217 
2218   __ BIND(L_exit);
2219   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2220   restore_arg_regs_using_thread();
2221   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2222                           SharedRuntime::_jlong_array_copy_ctr,
2223                  rscratch1); // Update counter after rscratch1 is free
2224   __ vzeroupper();
2225   __ xorptr(rax, rax); // return 0
2226   __ leave(); // required for proper stackwalking of RuntimeStub frame
2227   __ ret(0);
2228 
2229   return start;
2230 }
2231 
2232 
2233 // Arguments:
2234 //   entry     - location for return of (post-push) entry
2235 //   nooverlap_target - entry to branch to if no overlap detected
2236 //   is_oop  - true => oop array, so generate store check code
2237 //
2238 // Inputs:
2239 //   c_rarg0   - source array address
2240 //   c_rarg1   - destination array address
2241 //   c_rarg2   - element count, treated as ssize_t, can be zero
2242 //
2243 address StubGenerator::generate_conjoint_long_oop_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
2244   // aligned is always false -- x86_64 always uses the unaligned code
2245   const bool aligned = false;
2246   bool is_oop;
2247   bool dest_uninitialized;
2248   switch (stub_id) {
2249   case StubGenStubId::jlong_arraycopy_id:
2250     is_oop = false;
2251     dest_uninitialized = false;
2252     break;
2253   case StubGenStubId::oop_arraycopy_id:
2254     assert(!UseCompressedOops, "inconsistent oop copy size!");
2255     is_oop = true;
2256     dest_uninitialized = false;
2257     break;
2258   case StubGenStubId::oop_arraycopy_uninit_id:
2259     assert(!UseCompressedOops, "inconsistent oop copy size!");
2260     is_oop = true;
2261     dest_uninitialized = true;
2262     break;
2263   default:
2264     ShouldNotReachHere();
2265   }
2266 
2267   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2268 #if COMPILER2_OR_JVMCI
2269   if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2270     return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2271   }
2272 #endif
2273 
2274   __ align(CodeEntryAlignment);
2275   StubCodeMark mark(this, stub_id);
2276   address start = __ pc();
2277 
2278   Label L_copy_bytes, L_copy_8_bytes, L_exit;
2279   const Register from        = rdi;  // source array address
2280   const Register to          = rsi;  // destination array address
2281   const Register qword_count = rdx;  // elements count
2282   const Register saved_count = rcx;
2283 
2284   __ enter(); // required for proper stackwalking of RuntimeStub frame
2285   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2286 
2287   if (entry != nullptr) {
2288     *entry = __ pc();
2289     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2290     BLOCK_COMMENT("Entry:");
2291   }
2292 
2293   array_overlap_test(nooverlap_target, Address::times_8);
2294   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2295                                  // r9 is used to save r15_thread
2296   // 'from', 'to' and 'qword_count' are now valid
2297 
2298   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2299   if (dest_uninitialized) {
2300     decorators |= IS_DEST_UNINITIALIZED;
2301   }
2302   if (aligned) {
2303     decorators |= ARRAYCOPY_ALIGNED;
2304   }
2305 
2306   BasicType type = is_oop ? T_OBJECT : T_LONG;
2307   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2308   {
2309     // UnsafeMemoryAccess page error: continue after unsafe access
2310     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2311 
2312     __ jmp(L_copy_bytes);
2313 
2314     // Copy trailing qwords
2315   __ BIND(L_copy_8_bytes);
2316     bs->copy_load_at(_masm, decorators, type, 8,
2317                      rax, Address(from, qword_count, Address::times_8, -8),
2318                      r10);
2319     bs->copy_store_at(_masm, decorators, type, 8,
2320                       Address(to, qword_count, Address::times_8, -8), rax,
2321                       r10);
2322     __ decrement(qword_count);
2323     __ jcc(Assembler::notZero, L_copy_8_bytes);
2324   }
2325   if (is_oop) {
2326     __ jmp(L_exit);
2327   } else {
2328     restore_arg_regs_using_thread();
2329     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2330     __ xorptr(rax, rax); // return 0
2331     __ vzeroupper();
2332     __ leave(); // required for proper stackwalking of RuntimeStub frame
2333     __ ret(0);
2334   }
2335   {
2336     // UnsafeMemoryAccess page error: continue after unsafe access
2337     UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2338 
2339     // Copy in multi-byte chunks
2340     copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2341   }
2342   __ BIND(L_exit);
2343   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2344   restore_arg_regs_using_thread();
2345   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2346                           SharedRuntime::_jlong_array_copy_ctr,
2347                  rscratch1); // Update counter after rscratch1 is free
2348   __ vzeroupper();
2349   __ xorptr(rax, rax); // return 0
2350   __ leave(); // required for proper stackwalking of RuntimeStub frame
2351   __ ret(0);
2352 
2353   return start;
2354 }
2355 
2356 
2357 // Helper for generating a dynamic type check.
2358 // Smashes no registers.
2359 void StubGenerator::generate_type_check(Register sub_klass,
2360                                         Register super_check_offset,
2361                                         Register super_klass,
2362                                         Label& L_success) {
2363   assert_different_registers(sub_klass, super_check_offset, super_klass);
2364 
2365   BLOCK_COMMENT("type_check:");
2366 
2367   Label L_miss;
2368 
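       // The fast path checks the cached super type at 'super_check_offset'; if that is
       // inconclusive the slow path scans the secondary supers array.  Success branches
       // to L_success, failure falls through to L_miss below.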
2369   __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
2370                                    super_check_offset);
2371   __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
2372 
2373   // Fall through on failure!
2374   __ BIND(L_miss);
2375 }
2376 
2377 //
2378 //  Generate checkcasting array copy stub
2379 //
2380 //  Input:
2381 //    c_rarg0   - source array address
2382 //    c_rarg1   - destination array address
2383 //    c_rarg2   - element count, treated as ssize_t, can be zero
2384 //    c_rarg3   - size_t ckoff (super_check_offset)
2385 // not Win64
2386 //    c_rarg4   - oop ckval (super_klass)
2387 // Win64
2388 //    rsp+40    - oop ckval (super_klass)
2389 //
2390 //  Output:
2391 //    rax ==  0  -  success
2392 //    rax == -1^K - failure, where K is partial transfer count
2393 //
2394 address StubGenerator::generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
2395 
2396   bool dest_uninitialized;
2397   switch (stub_id) {
2398   case StubGenStubId::checkcast_arraycopy_id:
2399     dest_uninitialized = false;
2400     break;
2401   case StubGenStubId::checkcast_arraycopy_uninit_id:
2402     dest_uninitialized = true;
2403     break;
2404   default:
2405     ShouldNotReachHere();
2406   }
2407 
2408   Label L_load_element, L_store_element, L_do_card_marks, L_done;
2409 
2410   // Input registers (after setup_arg_regs)
2411   const Register from        = rdi;   // source array address
2412   const Register to          = rsi;   // destination array address
2413   const Register length      = rdx;   // elements count
2414   const Register ckoff       = rcx;   // super_check_offset
2415   const Register ckval       = r8;    // super_klass
2416 
2417   // Registers used as temps (r13, r14 are save-on-entry)
2418   const Register end_from    = from;  // source array end address
2419   const Register end_to      = r13;   // destination array end address
2420   const Register count       = rdx;   // -(count_remaining)
2421   const Register r14_length  = r14;   // saved copy of length
2422   // End pointers are inclusive, and if length is not zero they point
2423   // to the last unit copied:  end_to[0] := end_from[0]
2424 
2425   const Register rax_oop    = rax;    // actual oop copied
2426   const Register r11_klass  = r11;    // oop._klass
2427 
2428   //---------------------------------------------------------------
2429   // Assembler stub will be used for this call to arraycopy
2430   // if the two arrays are subtypes of Object[] but the
2431   // destination array type is not equal to or a supertype
2432   // of the source type.  Each element must be separately
2433   // checked.
2434 
2435   __ align(CodeEntryAlignment);
2436   StubCodeMark mark(this, stub_id);
2437   address start = __ pc();
2438 
2439   __ enter(); // required for proper stackwalking of RuntimeStub frame
2440 
2441 #ifdef ASSERT
2442   // caller guarantees that the arrays really are different
2443   // otherwise, we would have to make conjoint checks
2444   { Label L;
2445     array_overlap_test(L, TIMES_OOP);
2446     __ stop("checkcast_copy within a single array");
2447     __ bind(L);
2448   }
2449 #endif //ASSERT
2450 
2451   setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
2452                                   // ckoff => rcx, ckval => r8
2453                                   // r9 is used to save r15_thread
2454 #ifdef _WIN64
2455   // last argument (#4) is on stack on Win64
2456   __ movptr(ckval, Address(rsp, 6 * wordSize));
2457 #endif
2458 
2459   // Caller of this entry point must set up the argument registers.
2460   if (entry != nullptr) {
2461     *entry = __ pc();
2462     BLOCK_COMMENT("Entry:");
2463   }
2464 
2465   // allocate spill slots for r13, r14 and r10
2466   enum {
2467     saved_r13_offset,
2468     saved_r14_offset,
2469     saved_r10_offset,
2470     saved_rbp_offset
2471   };
2472   __ subptr(rsp, saved_rbp_offset * wordSize);
2473   __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2474   __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2475   __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2476 
2477 #ifdef ASSERT
2478     Label L2;
2479     __ get_thread(r14);
2480     __ cmpptr(r15_thread, r14);
2481     __ jcc(Assembler::equal, L2);
2482     __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2483     __ bind(L2);
2484 #endif // ASSERT
2485 
2486   // check that int operands are properly extended to size_t
2487   assert_clean_int(length, rax);
2488   assert_clean_int(ckoff, rax);
2489 
2490 #ifdef ASSERT
2491   BLOCK_COMMENT("assert consistent ckoff/ckval");
2492   // The ckoff and ckval must be mutually consistent,
2493   // even though caller generates both.
2494   { Label L;
2495     int sco_offset = in_bytes(Klass::super_check_offset_offset());
2496     __ cmpl(ckoff, Address(ckval, sco_offset));
2497     __ jcc(Assembler::equal, L);
2498     __ stop("super_check_offset inconsistent");
2499     __ bind(L);
2500   }
2501 #endif //ASSERT
2502 
2503   // Loop-invariant addresses.  They are exclusive end pointers.
2504   Address end_from_addr(from, length, TIMES_OOP, 0);
2505   Address   end_to_addr(to,   length, TIMES_OOP, 0);
2506   // Loop-variant addresses.  They assume post-incremented count < 0.
2507   Address from_element_addr(end_from, count, TIMES_OOP, 0);
2508   Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2509 
2510   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2511   if (dest_uninitialized) {
2512     decorators |= IS_DEST_UNINITIALIZED;
2513   }
2514 
2515   BasicType type = T_OBJECT;
2516   size_t element_size = UseCompressedOops ? 4 : 8;
2517 
2518   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2519   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2520 
2521   // Copy from low to high addresses, indexed from the end of each array.
2522   __ lea(end_from, end_from_addr);
2523   __ lea(end_to,   end_to_addr);
2524   __ movptr(r14_length, length);        // save a copy of the length
2525   assert(length == count, "");          // else fix next line:
2526   __ negptr(count);                     // negate and test the length
2527   __ jcc(Assembler::notZero, L_load_element);
2528 
2529   // Empty array:  Nothing to do.
2530   __ xorptr(rax, rax);                  // return 0 on (trivial) success
2531   __ jmp(L_done);
2532 
2533   // ======== begin loop ========
2534   // (Loop is rotated; its entry is L_load_element.)
2535   // Loop control:
2536   //   for (count = -count; count != 0; count++)
2537   // Base pointers src, dst are biased by 8*(count-1), to last element.
2538   __ align(OptoLoopAlignment);
2539 
2540   __ BIND(L_store_element);
2541   bs->copy_store_at(_masm,
2542                     decorators,
2543                     type,
2544                     element_size,
2545                     to_element_addr,
2546                     rax_oop,
2547                     r10);
2548   __ increment(count);               // increment the count toward zero
2549   __ jcc(Assembler::zero, L_do_card_marks);
2550 
2551   // ======== loop entry is here ========
2552   __ BIND(L_load_element);
2553   bs->copy_load_at(_masm,
2554                    decorators,
2555                    type,
2556                    element_size,
2557                    rax_oop,
2558                    from_element_addr,
2559                    r10);
2560   __ testptr(rax_oop, rax_oop);
2561   __ jcc(Assembler::zero, L_store_element);
2562 
2563   __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2564   generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2565   // ======== end loop ========
2566 
2567   // It was a real error; we must depend on the caller to finish the job.
2568   // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2569   // Emit GC store barriers for the oops we have copied (r14 + rdx),
2570   // and report their number to the caller.
2571   assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2572   Label L_post_barrier;
2573   __ addptr(r14_length, count);     // K = (original - remaining) oops
2574   __ movptr(rax, r14_length);       // save the value
2575   __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2576   __ jccb(Assembler::notZero, L_post_barrier);
2577   __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2578 
2579   // Come here on success only.
2580   __ BIND(L_do_card_marks);
2581   __ xorptr(rax, rax);              // return 0 on success
2582 
2583   __ BIND(L_post_barrier);
2584   bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2585 
2586   // Common exit point (success or failure).
2587   __ BIND(L_done);
2588   __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2589   __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2590   __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2591   restore_arg_regs_using_thread();
2592   INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2593   __ leave(); // required for proper stackwalking of RuntimeStub frame
2594   __ ret(0);
2595 
2596   return start;
2597 }
2598 
2599 
2600 //  Generate 'unsafe' array copy stub
2601 //  Though just as safe as the other stubs, it takes an unscaled
2602 //  size_t argument instead of an element count.
2603 //
2604 //  Input:
2605 //    c_rarg0   - source array address
2606 //    c_rarg1   - destination array address
2607 //    c_rarg2   - byte count, treated as ssize_t, can be zero
2608 //
2609 // Examines the alignment of the operands and dispatches
2610 // to a long, int, short, or byte copy loop.
2611 //
2612 address StubGenerator::generate_unsafe_copy(address byte_copy_entry, address short_copy_entry,
2613                                             address int_copy_entry, address long_copy_entry) {
2614 
2615   Label L_long_aligned, L_int_aligned, L_short_aligned;
2616 
2617   // Input registers (before setup_arg_regs)
2618   const Register from        = c_rarg0;  // source array address
2619   const Register to          = c_rarg1;  // destination array address
2620   const Register size        = c_rarg2;  // byte count (size_t)
2621 
2622   // Register used as a temp
2623   const Register bits        = rax;      // test copy of low bits
2624 
2625   __ align(CodeEntryAlignment);
2626   StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
2627   StubCodeMark mark(this, stub_id);
2628   address start = __ pc();
2629 
2630   __ enter(); // required for proper stackwalking of RuntimeStub frame
2631 
2632   // bump this on entry, not on exit:
2633   INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
2634 
2635   __ mov(bits, from);
2636   __ orptr(bits, to);
2637   __ orptr(bits, size);
2638 
2639   __ testb(bits, BytesPerLong-1);
2640   __ jccb(Assembler::zero, L_long_aligned);
2641 
2642   __ testb(bits, BytesPerInt-1);
2643   __ jccb(Assembler::zero, L_int_aligned);
2644 
2645   __ testb(bits, BytesPerShort-1);
2646   __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2647 
2648   __ BIND(L_short_aligned);
2649   __ shrptr(size, LogBytesPerShort); // size => short_count
2650   __ jump(RuntimeAddress(short_copy_entry));
2651 
2652   __ BIND(L_int_aligned);
2653   __ shrptr(size, LogBytesPerInt); // size => int_count
2654   __ jump(RuntimeAddress(int_copy_entry));
2655 
2656   __ BIND(L_long_aligned);
2657   __ shrptr(size, LogBytesPerLong); // size => qword_count
2658   __ jump(RuntimeAddress(long_copy_entry));
2659 
2660   return start;
2661 }
2662 
2663 
2664 // Static enum for helper
2665 enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
2666 // Helper for generate_unsafe_setmemory
2667 //
2668 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
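//
// A hedged C-level sketch of the loop structure emitted below (illustrative
// only; X is the chunk size selected by 'type', i.e. 2, 4 or 8 bytes):
//
//   size /= X;                                // number of X-byte chunks
//   for (size_t n = size >> 3; n > 0; n--) {  // main loop, 8 stores per pass
//     <store wide_value 8 times>;  dest += 8 * X;
//   }
//   for (size_t n = size & 7; n > 0; n--) {   // tail, one chunk per pass
//     <store wide_value once>;     dest += X;
//   }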
2669 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
2670                                      Register size, Register wide_value,
2671                                      Register tmp, Label& L_exit,
2672                                      MacroAssembler *_masm) {
2673   Label L_Loop, L_Tail, L_TailLoop;
2674 
2675   int shiftval = 0;
2676   int incr = 0;
2677 
2678   switch (type) {
2679     case USM_SHORT:
2680       shiftval = 1;
2681       incr = 16;
2682       break;
2683     case USM_DWORD:
2684       shiftval = 2;
2685       incr = 32;
2686       break;
2687     case USM_QUADWORD:
2688       shiftval = 3;
2689       incr = 64;
2690       break;
2691   }
2692 
2693   // At this point, we know the lower bits of size are zero
2694   __ shrq(size, shiftval);
2695   // size now has number of X-byte chunks (2, 4 or 8)
2696 
2697   // Number of (8*X)-byte chunks into tmp
2698   __ movq(tmp, size);
2699   __ shrq(tmp, 3);
2700   __ jccb(Assembler::zero, L_Tail);
2701 
2702   __ BIND(L_Loop);
2703 
2704   // Unroll 8 stores
2705   for (int i = 0; i < 8; i++) {
2706     switch (type) {
2707       case USM_SHORT:
2708         __ movw(Address(dest, (2 * i)), wide_value);
2709         break;
2710       case USM_DWORD:
2711         __ movl(Address(dest, (4 * i)), wide_value);
2712         break;
2713       case USM_QUADWORD:
2714         __ movq(Address(dest, (8 * i)), wide_value);
2715         break;
2716     }
2717   }
2718   __ addq(dest, incr);
2719   __ decrementq(tmp);
2720   __ jccb(Assembler::notZero, L_Loop);
2721 
2722   __ BIND(L_Tail);
2723 
2724   // Find number of remaining X-byte chunks
2725   __ andq(size, 0x7);
2726 
2727   // If zero, then we're done
2728   __ jccb(Assembler::zero, L_exit);
2729 
2730   __ BIND(L_TailLoop);
2731 
2732     switch (type) {
2733       case USM_SHORT:
2734         __ movw(Address(dest, 0), wide_value);
2735         break;
2736       case USM_DWORD:
2737         __ movl(Address(dest, 0), wide_value);
2738         break;
2739       case USM_QUADWORD:
2740         __ movq(Address(dest, 0), wide_value);
2741         break;
2742     }
2743   __ addq(dest, incr >> 3);
2744   __ decrementq(size);
2745   __ jccb(Assembler::notZero, L_TailLoop);
2746 }
2747 
2748 //  Generate 'unsafe' set memory stub
2749 //  Though just as safe as the other stubs, it takes an unscaled
2750 //  size_t (# bytes) argument instead of an element count.
2751 //
2752 //  Input:
2753 //    c_rarg0   - destination array address
2754 //    c_rarg1   - byte count (size_t)
2755 //    c_rarg2   - byte value
2756 //
2757 // Examines the alignment of the operands and dispatches
2758 // to a quadword, dword, short, or byte fill routine.
2759 //
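// A rough C-level sketch of the dispatch (illustrative only; the byte case
// tail-calls the pre-generated unsafe_byte_fill stub passed in as argument):
//
//   uintptr_t bits = (uintptr_t)dest | size;
//   if      ((bits & 7) == 0) fill with 8-byte stores;
//   else if ((bits & 3) == 0) fill with 4-byte stores;
//   else if ((bits & 1) == 0) fill with 2-byte stores;
//   else                      unsafe_byte_fill(dest, byteVal, size);
//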
2760 address StubGenerator::generate_unsafe_setmemory(address unsafe_byte_fill) {
2761   __ align(CodeEntryAlignment);
2762   StubGenStubId stub_id = StubGenStubId::unsafe_setmemory_id;
2763   StubCodeMark mark(this, stub_id);
2764   address start = __ pc();
2765   __ enter();   // required for proper stackwalking of RuntimeStub frame
2766 
2767   assert(unsafe_byte_fill != nullptr, "Invalid call");
2768 
2769   // bump this on entry, not on exit:
2770   INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);
2771 
2772   {
2773     Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;
2774 
2775     const Register dest = c_rarg0;
2776     const Register size = c_rarg1;
2777     const Register byteVal = c_rarg2;
2778     const Register wide_value = rax;
2779     const Register rScratch1 = r10;
2780 
2781     assert_different_registers(dest, size, byteVal, wide_value, rScratch1);
2782 
2783     //     fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2784 
2785     __ testq(size, size);
2786     __ jcc(Assembler::zero, L_exit);
2787 
2788     // Propagate byte to full Register
2789     __ movzbl(rScratch1, byteVal);
2790     __ mov64(wide_value, 0x0101010101010101ULL);
2791     __ imulq(wide_value, rScratch1);
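    // Worked example: for byteVal == 0xAB, 0xAB * 0x0101010101010101 ==
    // 0xABABABABABABABAB, i.e. the byte replicated into all eight byte
    // positions of wide_value (no carries, since 0xAB < 0x100).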
2792 
2793     // Check for pointer & size alignment
2794     __ movq(rScratch1, dest);
2795     __ orq(rScratch1, size);
2796 
2797     __ testb(rScratch1, 7);
2798     __ jcc(Assembler::equal, L_fillQuadwords);
2799 
2800     __ testb(rScratch1, 3);
2801     __ jcc(Assembler::equal, L_fillDwords);
2802 
2803     __ testb(rScratch1, 1);
2804     __ jcc(Assembler::notEqual, L_fillBytes);
2805 
2806     // Fill words
2807     {
2808       UnsafeMemoryAccessMark umam(this, true, true);
2809 
2810       // At this point, we know the low bit of size is zero, i.e. size
2811       // is a multiple of 2
2812       do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
2813                                L_exit, _masm);
2814     }
2815     __ jmpb(L_exit);
2816 
2817     __ BIND(L_fillQuadwords);
2818 
2819     // Fill QUADWORDs
2820     {
2821       UnsafeMemoryAccessMark umam(this, true, true);
2822 
2823       // At this point, we know the low 3 bits of size are zero, i.e. size
2824       // is a multiple of 8
2825       do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
2826                                L_exit, _masm);
2827     }
2828     __ BIND(L_exit);
2829 
2830     __ leave();   // required for proper stackwalking of RuntimeStub frame
2831     __ ret(0);
2832 
2833     __ BIND(L_fillDwords);
2834 
2835     // Fill DWORDs
2836     {
2837       UnsafeMemoryAccessMark umam(this, true, true);
2838 
2839       // At this point, we know the low 2 bits of size are zero, i.e. size
2840       // is a multiple of 4
2841       do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
2842                                L_exit, _masm);
2843     }
2844     __ jmpb(L_exit);
2845 
2846     __ BIND(L_fillBytes);
2847     // Set up for tail call to previously generated byte fill routine
2848     // Parameter order is (ptr, byteVal, size)
2849     __ xchgq(c_rarg1, c_rarg2);
2850     __ leave();    // Clear effect of enter()
2851     __ jump(RuntimeAddress(unsafe_byte_fill));
2852   }
2853 
2854   return start;
2855 }
2856 
2857 // Perform range checks on the proposed arraycopy.
2858 // Kills temp, but nothing else.
2859 // Also, clean the sign bits of src_pos and dst_pos.
2860 void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2861                                            Register src_pos, // source position (c_rarg1)
2862                                            Register dst,     // destination array oop (c_rarg2)
2863                                            Register dst_pos, // destination position (c_rarg3)
2864                                            Register length,
2865                                            Register temp,
2866                                            Label& L_failed) {
2867   BLOCK_COMMENT("arraycopy_range_checks:");
2868 
2869   //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2870   __ movl(temp, length);
2871   __ addl(temp, src_pos);             // src_pos + length
2872   __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2873   __ jcc(Assembler::above, L_failed);
2874 
2875   //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2876   __ movl(temp, length);
2877   __ addl(temp, dst_pos);             // dst_pos + length
2878   __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2879   __ jcc(Assembler::above, L_failed);
2880 
2881   // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2882   // Move with sign extension can be used since they are positive.
2883   __ movslq(src_pos, src_pos);
2884   __ movslq(dst_pos, dst_pos);
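  // (For a nonnegative 32-bit value, sign extension equals zero extension:
  //  e.g. movslq of 0x00000007 still yields 0x0000000000000007, so the high
  //  32 bits end up cleared.)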
2885 
2886   BLOCK_COMMENT("arraycopy_range_checks done");
2887 }
2888 
2889 
2890 //  Generate generic array copy stubs
2891 //
2892 //  Input:
2893 //    c_rarg0    -  src oop
2894 //    c_rarg1    -  src_pos (32-bits)
2895 //    c_rarg2    -  dst oop
2896 //    c_rarg3    -  dst_pos (32-bits)
2897 // not Win64
2898 //    c_rarg4    -  element count (32-bits)
2899 // Win64
2900 //    rsp+40     -  element count (32-bits)
2901 //
2902 //  Output:
2903 //    rax ==  0  -  success
2904 //    rax == -1^K - failure, where K is partial transfer count
2905 //
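//  For example, if the copy stops after 3 elements (say, on a failed subtype
//  check in the checkcast path), the stub returns -1^3 == ~3 == -4, and the
//  caller can recover the partial transfer count as ~rax.
//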
2906 address StubGenerator::generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2907                                              address int_copy_entry, address oop_copy_entry,
2908                                              address long_copy_entry, address checkcast_copy_entry) {
2909 
2910   Label L_failed, L_failed_0, L_objArray;
2911   Label L_copy_shorts, L_copy_ints, L_copy_longs;
2912 
2913   // Input registers
2914   const Register src        = c_rarg0;  // source array oop
2915   const Register src_pos    = c_rarg1;  // source position
2916   const Register dst        = c_rarg2;  // destination array oop
2917   const Register dst_pos    = c_rarg3;  // destination position
2918 #ifndef _WIN64
2919   const Register length     = c_rarg4;
2920   const Register rklass_tmp = r9;  // load_klass
2921 #else
2922   const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
2923   const Register rklass_tmp = rdi;  // load_klass
2924 #endif
2925 
2926   { int modulus = CodeEntryAlignment;
2927     int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2928     int advance = target - (__ offset() % modulus);
2929     if (advance < 0)  advance += modulus;
2930     if (advance > 0)  __ nop(advance);
2931   }
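  // Worked example, assuming CodeEntryAlignment == 32: if __ offset() % 32 is
  // 10, then advance == (32 - 5) - 10 == 17, so 17 nop bytes are emitted and
  // the 5-byte jmp below ends exactly on the next 32-byte boundary, which is
  // where 'start' will be placed.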
2932   StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
2933   StubCodeMark mark(this, stub_id);
2934 
2935   // Short-hop target to L_failed.  Makes for denser prologue code.
2936   __ BIND(L_failed_0);
2937   __ jmp(L_failed);
2938   assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2939 
2940   __ align(CodeEntryAlignment);
2941   address start = __ pc();
2942 
2943   __ enter(); // required for proper stackwalking of RuntimeStub frame
2944 
2945 #ifdef _WIN64
2946   __ push(rklass_tmp); // rdi is callee-save on Windows
2947 #endif
2948 
2949   // bump this on entry, not on exit:
2950   INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);
2951 
2952   //-----------------------------------------------------------------------
2953   // Assembler stub will be used for this call to arraycopy
2954   // if the following conditions are met:
2955   //
2956   // (1) src and dst must not be null.
2957   // (2) src_pos must not be negative.
2958   // (3) dst_pos must not be negative.
2959   // (4) length  must not be negative.
2960   // (5) src klass and dst klass should be the same and not null.
2961   // (6) src and dst should be arrays.
2962   // (7) src_pos + length must not exceed length of src.
2963   // (8) dst_pos + length must not exceed length of dst.
2964   //
2965 
2966   //  if (src == nullptr) return -1;
2967   __ testptr(src, src);         // src oop
2968   size_t j1off = __ offset();
2969   __ jccb(Assembler::zero, L_failed_0);
2970 
2971   //  if (src_pos < 0) return -1;
2972   __ testl(src_pos, src_pos); // src_pos (32-bits)
2973   __ jccb(Assembler::negative, L_failed_0);
2974 
2975   //  if (dst == nullptr) return -1;
2976   __ testptr(dst, dst);         // dst oop
2977   __ jccb(Assembler::zero, L_failed_0);
2978 
2979   //  if (dst_pos < 0) return -1;
2980   __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2981   size_t j4off = __ offset();
2982   __ jccb(Assembler::negative, L_failed_0);
2983 
2984   // The first four tests are very dense code,
2985   // but not quite dense enough to put four
2986   // jumps in a 16-byte instruction fetch buffer.
2987   // That's good, because some branch predictors
2988   // do not like jumps so close together.
2989   // Make sure of this.
2990   guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2991 
2992   // registers used as temp
2993   const Register r11_length    = r11; // elements count to copy
2994   const Register r10_src_klass = r10; // array klass
2995 
2996   //  if (length < 0) return -1;
2997   __ movl(r11_length, length);        // length (elements count, 32-bits value)
2998   __ testl(r11_length, r11_length);
2999   __ jccb(Assembler::negative, L_failed_0);
3000 
3001   __ load_klass(r10_src_klass, src, rklass_tmp);
3002 #ifdef ASSERT
3003   //  assert(src->klass() != nullptr);
3004   {
3005     BLOCK_COMMENT("assert klasses not null {");
3006     Label L1, L2;
3007     __ testptr(r10_src_klass, r10_src_klass);
3008     __ jcc(Assembler::notZero, L2);   // it is broken if klass is null
3009     __ bind(L1);
3010     __ stop("broken null klass");
3011     __ bind(L2);
3012     __ load_klass(rax, dst, rklass_tmp);
3013     __ cmpq(rax, 0);
3014     __ jcc(Assembler::equal, L1);     // this would be broken also
3015     BLOCK_COMMENT("} assert klasses not null done");
3016   }
3017 #endif
3018 
3019   // Load layout helper (32-bits)
3020   //
3021   //  |array_tag|     | header_size | element_type |     |log2_element_size|
3022   // 32        30    24            16              8     2                 0
3023   //
3024   //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3025   //
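  // A hedged sketch of how the fields are extracted further down (illustrative
  // only; mirrors the shifts and masks applied to rax_lh below):
  //
  //   header_size_in_bytes = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
  //   log2_element_size    =  lh & _lh_log2_element_size_mask;
  //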
3026 
3027   const int lh_offset = in_bytes(Klass::layout_helper_offset());
3028 
3029   // Handle objArrays completely differently...
3030   const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3031   __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3032   __ jcc(Assembler::equal, L_objArray);
3033 
3034   //  if (src->klass() != dst->klass()) return -1;
3035   __ load_klass(rax, dst, rklass_tmp);
3036   __ cmpq(r10_src_klass, rax);
3037   __ jcc(Assembler::notEqual, L_failed);
3038 
3039   // Check for flat inline type array -> return -1
3040   __ test_flat_array_oop(src, rax, L_failed);
3041 
3042   // Check for null-free (non-flat) inline type array -> handle as object array
3043   __ test_null_free_array_oop(src, rax, L_objArray);
3044 
3045   const Register rax_lh = rax;  // layout helper
3046   __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3047 
3048   // Check for flat inline type array -> return -1
3049   __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace);
3050   __ jcc(Assembler::notZero, L_failed);
3051 
3052   //  if (!src->is_Array()) return -1;
3053   __ cmpl(rax_lh, Klass::_lh_neutral_value);
3054   __ jcc(Assembler::greaterEqual, L_failed);
3055 
3056   // At this point, it is known to be a typeArray (array_tag 0x3).
3057 #ifdef ASSERT
3058   {
3059     BLOCK_COMMENT("assert primitive array {");
3060     Label L;
3061     __ movl(rklass_tmp, rax_lh);
3062     __ sarl(rklass_tmp, Klass::_lh_array_tag_shift);
3063     __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value);
3064     __ jcc(Assembler::equal, L);
3065     __ stop("must be a primitive array");
3066     __ bind(L);
3067     BLOCK_COMMENT("} assert primitive array done");
3068   }
3069 #endif
3070 
3071   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3072                          r10, L_failed);
3073 
3074   // TypeArrayKlass
3075   //
3076   // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3077   // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3078   //
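  // For example, for a jint array (log2elemsize == 2) with src_pos == 3, the
  // source address is src + array_header_in_bytes() + 12; the header size
  // itself comes from the layout helper and depends on the build.
  //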
3079 
3080   const Register r10_offset = r10;    // array offset
3081   const Register rax_elsize = rax_lh; // element size
3082 
3083   __ movl(r10_offset, rax_lh);
3084   __ shrl(r10_offset, Klass::_lh_header_size_shift);
3085   __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3086   __ addptr(src, r10_offset);           // src array offset
3087   __ addptr(dst, r10_offset);           // dst array offset
3088   BLOCK_COMMENT("choose copy loop based on element size");
3089   __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3090 
3091 #ifdef _WIN64
3092   __ pop(rklass_tmp); // Restore callee-save rdi
3093 #endif
3094 
3095   // next registers should be set before the jump to corresponding stub
3096   const Register from     = c_rarg0;  // source array address
3097   const Register to       = c_rarg1;  // destination array address
3098   const Register count    = c_rarg2;  // elements count
3099 
3100   // 'from', 'to', 'count' registers should be set in that order
3101   // since they are the same as 'src', 'src_pos', 'dst'.
3102 
3103   __ cmpl(rax_elsize, 0);
3104   __ jccb(Assembler::notEqual, L_copy_shorts);
3105   __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3106   __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3107   __ movl2ptr(count, r11_length); // length
3108   __ jump(RuntimeAddress(byte_copy_entry));
3109 
3110 __ BIND(L_copy_shorts);
3111   __ cmpl(rax_elsize, LogBytesPerShort);
3112   __ jccb(Assembler::notEqual, L_copy_ints);
3113   __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3114   __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3115   __ movl2ptr(count, r11_length); // length
3116   __ jump(RuntimeAddress(short_copy_entry));
3117 
3118 __ BIND(L_copy_ints);
3119   __ cmpl(rax_elsize, LogBytesPerInt);
3120   __ jccb(Assembler::notEqual, L_copy_longs);
3121   __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3122   __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3123   __ movl2ptr(count, r11_length); // length
3124   __ jump(RuntimeAddress(int_copy_entry));
3125 
3126 __ BIND(L_copy_longs);
3127 #ifdef ASSERT
3128   {
3129     BLOCK_COMMENT("assert long copy {");
3130     Label L;
3131     __ cmpl(rax_elsize, LogBytesPerLong);
3132     __ jcc(Assembler::equal, L);
3133     __ stop("must be long copy, but elsize is wrong");
3134     __ bind(L);
3135     BLOCK_COMMENT("} assert long copy done");
3136   }
3137 #endif
3138   __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3139   __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3140   __ movl2ptr(count, r11_length); // length
3141   __ jump(RuntimeAddress(long_copy_entry));
3142 
3143   // ObjArrayKlass
3144 __ BIND(L_objArray);
3145   // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3146 
3147   Label L_plain_copy, L_checkcast_copy;
3148   //  test array classes for subtyping
3149   __ load_klass(rax, dst, rklass_tmp);
3150   __ cmpq(r10_src_klass, rax); // usual case is exact equality
3151   __ jcc(Assembler::notEqual, L_checkcast_copy);
3152 
3153   // Identically typed arrays can be copied without element-wise checks.
3154   arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3155                          r10, L_failed);
3156 
3157   __ lea(from, Address(src, src_pos, TIMES_OOP,
3158                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3159   __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3160                arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3161   __ movl2ptr(count, r11_length); // length
3162 __ BIND(L_plain_copy);
3163 #ifdef _WIN64
3164   __ pop(rklass_tmp); // Restore callee-save rdi
3165 #endif
3166   __ jump(RuntimeAddress(oop_copy_entry));
3167 
3168 __ BIND(L_checkcast_copy);
3169   // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3170   {
3171     // Before looking at dst.length, make sure dst is also an objArray.
3172     // This check also fails for flat arrays which are not supported.
3173     __ cmpl(Address(rax, lh_offset), objArray_lh);
3174     __ jcc(Assembler::notEqual, L_failed);
3175 
3176 #ifdef ASSERT
3177     {
3178       BLOCK_COMMENT("assert not null-free array {");
3179       Label L;
3180       __ test_non_null_free_array_oop(dst, rklass_tmp, L);
3181       __ stop("unexpected null-free array");
3182       __ bind(L);
3183       BLOCK_COMMENT("} assert not null-free array");
3184     }
3185 #endif
3186 
3187     // It is safe to examine both src.length and dst.length.
3188     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3189                            rax, L_failed);
3190 
3191     const Register r11_dst_klass = r11;
3192     __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3193 
3194     // Marshal the base address arguments now, freeing registers.
3195     __ lea(from, Address(src, src_pos, TIMES_OOP,
3196                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3197     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3198                  arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3199     __ movl(count, length);           // length (reloaded)
3200     Register sco_temp = c_rarg3;      // this register is free now
3201     assert_different_registers(from, to, count, sco_temp,
3202                                r11_dst_klass, r10_src_klass);
3203     assert_clean_int(count, sco_temp);
3204 
3205     // Generate the type check.
3206     const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3207     __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3208     assert_clean_int(sco_temp, rax);
3209     generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3210 
3211     // Fetch destination element klass from the ObjArrayKlass header.
3212     int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3213     __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3214     __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3215     assert_clean_int(sco_temp, rax);
3216 
3217 #ifdef _WIN64
3218     __ pop(rklass_tmp); // Restore callee-save rdi
3219 #endif
3220 
3221     // the checkcast_copy loop needs two extra arguments:
3222     assert(c_rarg3 == sco_temp, "#3 already in place");
3223     // Set up arguments for checkcast_copy_entry.
3224     setup_arg_regs_using_thread(4);
3225     __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3226     __ jump(RuntimeAddress(checkcast_copy_entry));
3227   }
3228 
3229 __ BIND(L_failed);
3230 #ifdef _WIN64
3231   __ pop(rklass_tmp); // Restore callee-save rdi
3232 #endif
3233   __ xorptr(rax, rax);
3234   __ notptr(rax); // return -1
3235   __ leave();   // required for proper stackwalking of RuntimeStub frame
3236   __ ret(0);
3237 
3238   return start;
3239 }
3240 
3241 #undef __