1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/objArrayKlass.hpp"
  30 #include "runtime/sharedRuntime.hpp"
  31 #include "runtime/stubRoutines.hpp"
  32 #include "stubGenerator_x86_64.hpp"
  33 #ifdef COMPILER2
  34 #include "opto/c2_globals.hpp"
  35 #endif
  36 #if INCLUDE_JVMCI
  37 #include "jvmci/jvmci_globals.hpp"
  38 #endif
  39 
  40 #define __ _masm->
  41 
  42 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #else
  47 #define BLOCK_COMMENT(str) __ block_comment(str)
  48 #endif // PRODUCT
  49 
  50 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  51 
  52 #ifdef PRODUCT
  53 #define INC_COUNTER_NP(counter, rscratch) ((void)0)
  54 #else
  55 #define INC_COUNTER_NP(counter, rscratch) \
  56 BLOCK_COMMENT("inc_counter " #counter); \
  57 inc_counter_np(_masm, counter, rscratch);
  58 
  59 static void inc_counter_np(MacroAssembler* _masm, int& counter, Register rscratch) {
  60   __ incrementl(ExternalAddress((address)&counter), rscratch);
  61 }
  62 
  63 #if COMPILER2_OR_JVMCI
  64 static int& get_profile_ctr(int shift) {
  65   if (shift == 0) {
  66     return SharedRuntime::_jbyte_array_copy_ctr;
  67   } else if (shift == 1) {
  68     return SharedRuntime::_jshort_array_copy_ctr;
  69   } else if (shift == 2) {
  70     return SharedRuntime::_jint_array_copy_ctr;
  71   } else {
  72     assert(shift == 3, "");
  73     return SharedRuntime::_jlong_array_copy_ctr;
  74   }
  75 }
  76 #endif // COMPILER2_OR_JVMCI
  77 #endif // !PRODUCT
  78 
// Generate and register all x86_64 arraycopy stub routines.
//
// Ordering matters: each disjoint generator stores its no-overlap entry
// point into 'entry', which the matching conjoint generator consumes
// immediately afterwards (the conjoint stub tail-jumps there when the
// ranges do not overlap). The primitive entries captured here are later
// wired into the unsafe/generic dispatch stubs.
void StubGenerator::generate_arraycopy_stubs() {
  address entry;
  address entry_jbyte_arraycopy;
  address entry_jshort_arraycopy;
  address entry_jint_arraycopy;
  address entry_oop_arraycopy;
  address entry_jlong_arraycopy;
  address entry_checkcast_arraycopy;

  // Primitive element copies: disjoint first (captures 'entry'),
  // then the conjoint variant that reuses it.
  StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
                                                                         "jbyte_disjoint_arraycopy");
  StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
                                                                         "jbyte_arraycopy");

  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
                                                                          "jshort_disjoint_arraycopy");
  StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
                                                                          "jshort_arraycopy");

  StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
                                                                            "jint_disjoint_arraycopy");
  StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
                                                                            &entry_jint_arraycopy, "jint_arraycopy");

  StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
                                                                             "jlong_disjoint_arraycopy");
  StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
                                                                             &entry_jlong_arraycopy, "jlong_arraycopy");
  // Oop copies: element width depends on UseCompressedOops, so reuse the
  // int-sized generators for narrow oops and the long-sized ones otherwise.
  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
                                                                            "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
                                                                            &entry_oop_arraycopy, "oop_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
                                                                                   "oop_disjoint_arraycopy_uninit",
                                                                                   /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
                                                                                   NULL, "oop_arraycopy_uninit",
                                                                                   /*dest_uninitialized*/true);
  } else {
    StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
                                                                             "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
                                                                             &entry_oop_arraycopy, "oop_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
                                                                                    "oop_disjoint_arraycopy_uninit",
                                                                                    /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
                                                                                    NULL, "oop_arraycopy_uninit",
                                                                                    /*dest_uninitialized*/true);
  }

  // Type-checked oop copy (stores element klass checks in the copy loop).
  StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                      /*dest_uninitialized*/true);

  // Dispatch stubs that route to the primitive entries captured above.
  StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
                                                            entry_jbyte_arraycopy,
                                                            entry_jshort_arraycopy,
                                                            entry_jint_arraycopy,
                                                            entry_jlong_arraycopy);
  StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
                                                             entry_jbyte_arraycopy,
                                                             entry_jshort_arraycopy,
                                                             entry_jint_arraycopy,
                                                             entry_oop_arraycopy,
                                                             entry_jlong_arraycopy,
                                                             entry_checkcast_arraycopy);

  // Array fill stubs (aligned variants take header-aligned destinations).
  StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
  StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
  StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
  StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
  StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");

  // We don't generate specialized code for HeapWord-aligned source
  // arrays, so just use the code we've already generated
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

  StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

  StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
  StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

  StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
  StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
}
 175 
 176 
 177 // Verify that a register contains clean 32-bits positive value
 178 // (high 32-bits are 0) so it could be used in 64-bits shifts.
 179 //
 180 //  Input:
 181 //    Rint  -  32-bits value
 182 //    Rtmp  -  scratch
 183 //
void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
  Label L;
  assert_different_registers(Rtmp, Rint);
  // Sign-extend the low 32 bits of Rint into Rtmp and compare against the
  // full 64-bit value: they are equal only when Rint already equals the
  // sign-extension of its low half, i.e. the high 32 bits carry no stray
  // data from a previous use of the register.
  __ movslq(Rtmp, Rint);
  __ cmpq(Rtmp, Rint);
  __ jcc(Assembler::equal, L);
  __ stop("high 32-bits of int value are not 0");
  __ bind(L);
#endif
}
 195 
 196 
 197 //  Generate overlap test for array copy stubs
 198 //
 199 //  Input:
 200 //     c_rarg0 - from
 201 //     c_rarg1 - to
 202 //     c_rarg2 - element count
 203 //
 204 //  Output:
 205 //     rax   - &from[element count - 1]
 206 //
// Emit the overlap check for conjoint copy stubs. Branches to the
// no-overlap target (either the external address 'no_overlap_target' or
// the local label '*NOLp' — exactly one of the two is used) when a plain
// forward copy is safe, i.e. when:
//   to <= from                      (dest starts at or before source), or
//   to >= from + count * scale      (dest starts at or past the source end).
// Otherwise emission falls through into the backward-copy code.
// Side effect: rax (end_from) is left holding from + count * scale.
void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
  const Register from     = c_rarg0;
  const Register to       = c_rarg1;
  const Register count    = c_rarg2;
  const Register end_from = rax;

  __ cmpptr(to, from);
  __ lea(end_from, Address(from, count, sf, 0));
  if (NOLp == NULL) {
    // Caller supplied an external stub address as the no-overlap target.
    ExternalAddress no_overlap(no_overlap_target);
    __ jump_cc(Assembler::belowEqual, no_overlap);
    __ cmpptr(to, end_from);
    __ jump_cc(Assembler::aboveEqual, no_overlap);
  } else {
    // Caller supplied a local label instead.
    __ jcc(Assembler::belowEqual, (*NOLp));
    __ cmpptr(to, end_from);
    __ jcc(Assembler::aboveEqual, (*NOLp));
  }
}
 226 
 227 
 228 // Copy big chunks forward
 229 //
 230 // Inputs:
//   end_from     - source array end address
 232 //   end_to       - destination array end address
 233 //   qword_count  - 64-bits element count, negative
 234 //   to           - scratch
 235 //   L_copy_bytes - entry label
 236 //   L_copy_8_bytes  - exit  label
 237 //
// Emit the bulk forward-copy loop. Callers must jump to L_copy_bytes to
// enter (the loop counter is pre-adjusted there); on exit, control falls
// through with qword_count adjusted, or branches to L_copy_8_bytes for
// the trailing-qword loop. qword_count is negative and counts up toward
// zero; all addresses are formed relative to the array end pointers, so
// the copy proceeds forward in memory.
void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
                                       Register qword_count, Register to,
                                       Label& L_copy_bytes, Label& L_copy_8_bytes) {
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    // 64 bytes (8 qwords) per iteration using unaligned vector moves.
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      // Two 32-byte YMM transfers per iteration.
      __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
      __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
      __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
      __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
    } else {
      // Four 16-byte XMM transfers per iteration.
      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
      __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
      __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
      __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
      __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
    }

    // Loop entry: advance the (negative) count by 8 qwords and loop while
    // at least a full 64-byte chunk remains.
    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 8);
    __ jcc(Assembler::lessEqual, L_loop);
    __ subptr(qword_count, 4);  // sub(8) and add(4)
    __ jccb(Assembler::greater, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
      __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
    } else {
      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
    }
    __ addptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32-bytes per iteration
    // (scalar path: four 8-byte moves through the 'to' scratch register).
    __ BIND(L_loop);
    __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
    __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
    __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
    __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
    __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
    __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
    __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
    __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);

    // Loop entry: advance by 4 qwords and loop while a chunk remains.
    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::lessEqual, L_loop);
  }
  // Undo the final over-advance; if qwords remain, go copy them one by one.
  __ subptr(qword_count, 4);
  __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
}
 299 
 300 
 301 // Copy big chunks backward
 302 //
 303 // Inputs:
 304 //   from         - source arrays address
 305 //   dest         - destination array address
 306 //   qword_count  - 64-bits element count
 307 //   to           - scratch
 308 //   L_copy_bytes - entry label
 309 //   L_copy_8_bytes  - exit  label
 310 //
// Emit the bulk backward-copy loop (for overlapping conjoint copies).
// Callers must jump to L_copy_bytes to enter; on exit, control falls
// through with qword_count adjusted, or branches to L_copy_8_bytes for
// the trailing-qword loop. qword_count is positive and counts down toward
// zero; addresses are formed relative to the array start pointers, so the
// copy proceeds from high addresses to low.
void StubGenerator::copy_bytes_backward(Register from, Register dest,
                                        Register qword_count, Register to,
                                        Label& L_copy_bytes, Label& L_copy_8_bytes) {
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    // 64 bytes (8 qwords) per iteration using unaligned vector moves.
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      // Two 32-byte YMM transfers per iteration.
      __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
      __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
      __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
      __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
    } else {
      // Four 16-byte XMM transfers per iteration.
      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
      __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
      __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
      __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
      __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
      __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
      __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
    }

    // Loop entry: decrement the count by 8 qwords and loop while a full
    // 64-byte chunk remains.
    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_loop);

    __ addptr(qword_count, 4);  // add(8) and sub(4)
    __ jccb(Assembler::less, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
      __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
    } else {
      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
      __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
      __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
    }
    __ subptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32-bytes per iteration
    // (scalar path: four 8-byte moves through the 'to' scratch register).
    __ BIND(L_loop);
    __ movq(to, Address(from, qword_count, Address::times_8, 24));
    __ movq(Address(dest, qword_count, Address::times_8, 24), to);
    __ movq(to, Address(from, qword_count, Address::times_8, 16));
    __ movq(Address(dest, qword_count, Address::times_8, 16), to);
    __ movq(to, Address(from, qword_count, Address::times_8,  8));
    __ movq(Address(dest, qword_count, Address::times_8,  8), to);
    __ movq(to, Address(from, qword_count, Address::times_8,  0));
    __ movq(Address(dest, qword_count, Address::times_8,  0), to);

    // Loop entry: decrement by 4 qwords and loop while a chunk remains.
    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::greaterEqual, L_loop);
  }
  // Undo the final over-decrement; if qwords remain, go copy them one by one.
  __ addptr(qword_count, 4);
  __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
}
 373 
 374 #if COMPILER2_OR_JVMCI
 375 
 376 // Note: Following rules apply to AVX3 optimized arraycopy stubs:-
 377 // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
 378 //   for both special cases (various small block sizes) and aligned copy loop. This is the
 379 //   default configuration.
 380 // - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs)
 381 //   for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
// - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS has been observed to
//   perform better for disjoint copies. For conjoint/backward copies the vector based
//   copy performs better.
// - If the user sets AVX3Threshold=0, then the special cases for small block sizes operate over
//   64 byte vector registers (ZMMs).
 387 
 388 // Inputs:
 389 //   c_rarg0   - source array address
 390 //   c_rarg1   - destination array address
 391 //   c_rarg2   - element count, treated as ssize_t, can be zero
 392 //
 393 //
 394 // Side Effects:
 395 //   disjoint_copy_avx3_masked is set to the no-overlap entry point
 396 //   used by generate_conjoint_[byte/int/short/long]_copy().
 397 //
// Generate an AVX3 (AVX-512 BW+VL+F) disjoint arraycopy stub for the element
// size given by 'shift' (log2 of element bytes), or for oops when 'is_oop'.
// Structure: zero-length check, masked-vector special cases for small
// lengths, then a PRE-MAIN-POST aligned copy loop at either 32- or 64-byte
// granularity (selected by MaxVectorSize / AVX3Threshold), with an optional
// REP MOVS path for large copies when vectors are capped at 32 bytes.
// Returns the stub's start address; '*entry' (if non-NULL) receives the
// post-frame-setup entry point used by the matching conjoint stub.
address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const char *name,
                                                          int shift, bool aligned, bool is_oop,
                                                          bool dest_uninitialized) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register temp1       = r8;
  const Register temp2       = r11;
  const Register temp3       = rax;
  const Register temp4       = rcx;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // Select the BasicType for barrier/argument setup from the element shift.
  BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  // GC barrier bracketing: prologue here, epilogue after the copy below.
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // Type(shift)           byte(0), short(1), int(2),   long(3)
    int loop_size[]        = { 192,     96,       48,      24};
    int threshold[]        = { 4096,    2048,     1024,    512};

    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count and temp4 holds running count used to compute
    // next address offset for start of to/from addresses (temp4 * scale).
    __ mov64(temp4, 0);
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                 temp4, temp3, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    // Above the threshold, large copies either switch to 64-byte vectors
    // or (when vectors are capped below 64 bytes) to REP MOVS.
    if (avx3threshold != 0) {
      __ cmpq(count, threshold[shift]);
      if (MaxVectorSize == 64) {
        // Copy using 64 byte vectors.
        __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
      } else {
        assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offers a faster copy path.
        __ jcc(Assembler::greaterEqual, L_repmovs);
      }
    }

    if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      // temp2 := number of elements needed to reach 32-byte dst alignment.
      __ negptr(temp2);
      __ addq(temp2, 32);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
         copy64_avx(to, from, temp4, xmm1, false, shift, 0);
         copy64_avx(to, from, temp4, xmm1, false, shift, 64);
         copy64_avx(to, from, temp4, xmm1, false, shift, 128);
         __ addptr(temp4, loop_size[shift]);
         __ subq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop);

      __ addq(temp1, loop_size[shift]);

      // Tail loop.
      __ jmp(L_tail);

      // Large-copy path via REP MOVSQ; copies whole qwords, then any
      // remainder is handled by the tail loop above.
      __ BIND(L_repmovs);
        __ movq(temp2, temp1);
        // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics.
        __ movq(temp3, to);
        __ movq(to,  from);
        __ movq(from, temp3);
        // Save to/from for restoration post rep_mov.
        __ movq(temp1, to);
        __ movq(temp3, from);
        if(shift < 3) {
          __ shrq(temp2, 3-shift);     // quad word count
        }
        __ movq(temp4 , temp2);        // move quad word count into temp4(RCX).
        __ rep_mov();
        __ shlq(temp2, 3);             // convert quad words into byte count.
        if(shift) {
          __ shrq(temp2, shift);       // type specific count.
        }
        // Restore original addresses in to/from.
        __ movq(to, temp3);
        __ movq(from, temp1);
        __ movq(temp4, temp2);
        __ movq(temp1, count);
        __ subq(temp1, temp2);         // trailing part (less than a quad word size).
        __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      // temp2 := number of elements needed to reach 64-byte dst alignment.
      __ negptr(temp2);
      __ addq(temp2, 64);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
         copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
         copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
         copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
         __ addptr(temp4, loop_size[shift]);
         __ subq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop_64bytes);

      __ addq(temp1, loop_size[shift]);
      // Zero length check.
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }

  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
  if (is_oop) {
    __ movq(r11, shift == 3 ? count : to);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
 611 
 612 
 613 // Inputs:
 614 //   c_rarg0   - source array address
 615 //   c_rarg1   - destination array address
 616 //   c_rarg2   - element count, treated as ssize_t, can be zero
 617 //
 618 //
 619 address StubGenerator::generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
 620                                                           address nooverlap_target, bool aligned,
 621                                                           bool is_oop, bool dest_uninitialized) {
 622   __ align(CodeEntryAlignment);
 623   StubCodeMark mark(this, "StubRoutines", name);
 624   address start = __ pc();
 625 
 626   int avx3threshold = VM_Version::avx3_threshold();
 627   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
 628 
 629   Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
 630   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
 631   const Register from        = rdi;  // source array address
 632   const Register to          = rsi;  // destination array address
 633   const Register count       = rdx;  // elements count
 634   const Register temp1       = r8;
 635   const Register temp2       = rcx;
 636   const Register temp3       = r11;
 637   const Register temp4       = rax;
 638   // End pointers are inclusive, and if count is not zero they point
 639   // to the last unit copied:  end_to[0] := end_from[0]
 640 
 641   __ enter(); // required for proper stackwalking of RuntimeStub frame
 642   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 643 
 644   if (entry != NULL) {
 645     *entry = __ pc();
 646      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 647     BLOCK_COMMENT("Entry:");
 648   }
 649 
 650   array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
 651 
 652   BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 653   BasicType type = is_oop ? T_OBJECT : type_vec[shift];
 654 
 655   setup_argument_regs(type);
 656 
 657   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 658   if (dest_uninitialized) {
 659     decorators |= IS_DEST_UNINITIALIZED;
 660   }
 661   if (aligned) {
 662     decorators |= ARRAYCOPY_ALIGNED;
 663   }
 664   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 665   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
 666   {
 667     // Type(shift)       byte(0), short(1), int(2),   long(3)
 668     int loop_size[]   = { 192,     96,       48,      24};
 669     int threshold[]   = { 4096,    2048,     1024,    512};
 670 
 671     // UnsafeCopyMemory page error: continue after ucm
 672     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
 673     // 'from', 'to' and 'count' are now valid
 674 
 675     // temp1 holds remaining count.
 676     __ movq(temp1, count);
 677 
 678     // Zero length check.
 679     __ BIND(L_tail);
 680     __ cmpq(temp1, 0);
 681     __ jcc(Assembler::lessEqual, L_exit);
 682 
 683     __ mov64(temp2, 0);
 684     __ movq(temp3, temp1);
 685     // Special cases using 32 byte [masked] vector copy operations.
 686     arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
 687                                           temp4, use64byteVector, L_entry, L_exit);
 688 
 689     // PRE-MAIN-POST loop for aligned copy.
 690     __ BIND(L_entry);
 691 
 692     if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
 693       __ cmpq(temp1, threshold[shift]);
 694       __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
 695     }
 696 
 697     if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
 698       // Partial copy to make dst address 32 byte aligned.
 699       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
 700       __ andq(temp2, 31);
 701       __ jcc(Assembler::equal, L_main_pre_loop);
 702 
 703       if (shift) {
 704         __ shrq(temp2, shift);
 705       }
 706       __ subq(temp1, temp2);
 707       copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
 708 
 709       __ cmpq(temp1, loop_size[shift]);
 710       __ jcc(Assembler::less, L_tail);
 711 
 712       __ BIND(L_main_pre_loop);
 713 
 714       // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
 715       __ align32();
 716       __ BIND(L_main_loop);
 717          copy64_avx(to, from, temp1, xmm1, true, shift, -64);
 718          copy64_avx(to, from, temp1, xmm1, true, shift, -128);
 719          copy64_avx(to, from, temp1, xmm1, true, shift, -192);
 720          __ subptr(temp1, loop_size[shift]);
 721          __ cmpq(temp1, loop_size[shift]);
 722          __ jcc(Assembler::greater, L_main_loop);
 723 
 724       // Tail loop.
 725       __ jmp(L_tail);
 726     }
 727 
 728     if (MaxVectorSize > 32) {
 729       __ BIND(L_pre_main_post_64);
 730       // Partial copy to make dst address 64 byte aligned.
 731       __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
 732       __ andq(temp2, 63);
 733       __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
 734 
 735       if (shift) {
 736         __ shrq(temp2, shift);
 737       }
 738       __ subq(temp1, temp2);
 739       copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
 740 
 741       __ cmpq(temp1, loop_size[shift]);
 742       __ jcc(Assembler::less, L_tail64);
 743 
 744       __ BIND(L_main_pre_loop_64bytes);
 745 
 746       // Main loop with aligned copy block size of 192 bytes at
 747       // 64 byte copy granularity.
 748       __ align32();
 749       __ BIND(L_main_loop_64bytes);
 750          copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
 751          copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
 752          copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
 753          __ subq(temp1, loop_size[shift]);
 754          __ cmpq(temp1, loop_size[shift]);
 755          __ jcc(Assembler::greater, L_main_loop_64bytes);
 756 
 757       // Zero length check.
 758       __ cmpq(temp1, 0);
 759       __ jcc(Assembler::lessEqual, L_exit);
 760 
 761       __ BIND(L_tail64);
 762 
 763       // Tail handling using 64 byte [masked] vector copy operations.
 764       use64byteVector = true;
 765       __ mov64(temp2, 0);
 766       __ movq(temp3, temp1);
 767       arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
 768                                             temp4, use64byteVector, L_entry, L_exit);
 769     }
 770     __ BIND(L_exit);
 771   }
 772   address ucme_exit_pc = __ pc();
 773   // When called from generic_arraycopy r11 contains specific values
 774   // used during arraycopy epilogue, re-initializing r11.
 775   if(is_oop) {
 776     __ movq(r11, count);
 777   }
 778   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
 779   restore_argument_regs(type);
 780   INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
 781   __ xorptr(rax, rax); // return 0
 782   __ vzeroupper();
 783   __ leave(); // required for proper stackwalking of RuntimeStub frame
 784   __ ret(0);
 785 
 786   return start;
 787 }
 788 
// Emit special-case handling for a forward (disjoint) arraycopy whose total
// length is at most 192 bytes.  The element count is compared against the
// byte thresholds 32/64/96/128/160/192 (expressed as element counts via
// size_mat) and the smallest matching case copies the data with 32-byte
// [masked] AVX vector moves (64-byte moves where use64byteVector permits).
// Counts above 192 bytes branch to L_entry (the caller's main copy loop);
// every completed case jumps to L_exit.
//
//   xmm    - vector register used for the data moves
//   mask   - opmask register used by the masked tail moves
//   from   - source base address
//   to     - destination base address
//   count  - element count (elements, not bytes); consumed as scratch here
//   shift  - log2(element size): 0=byte, 1=short, 2=int, 3=long
//   index  - index register applied to both source and destination addresses
//   temp   - scratch register used to materialize the copy masks
void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                 Register to, Register count, int shift,
                                                 Register index, Register temp,
                                                 bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  // Byte thresholds 32..192 expressed as element counts, indexed by shift.
  int size_mat[][6] = {
  /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
  /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
  /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
  /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
  };

  // Case A) Special case for length less than equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  __ jmp(L_exit);

  // Case C) Special case for length less than equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  __ subq(count, 64 >> shift);  // elements already moved by the 64-byte copy
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  __ jmp(L_exit);

  // Case D) Special case for length less than equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  __ subq(count, 96 >> shift);  // elements already moved by the copies above
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  __ jmp(L_exit);

  // Case E) Special case for length less than equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  __ subq(count, 128 >> shift);  // elements already moved by the copies above
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  __ jmp(L_exit);

  // Case F) Special case for length less than equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);  // > 192 bytes: fall back to the main loop
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  __ subq(count, 160 >> shift);  // elements already moved by the copies above
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  __ jmp(L_exit);
}
 856 
// Conjoint (overlap-safe) counterpart of arraycopy_avx3_special_cases:
// handles arraycopy of at most 192 bytes between potentially overlapping
// ranges.  Each case first copies the high end of the range through
// end_index with negative displacements, then finishes with a masked copy
// of the remaining low-end elements through start_index, so overlapping
// source bytes are read before they can be overwritten.  Counts above
// 192 bytes branch to L_entry; completed cases jump to L_exit.
//
//   xmm         - vector register used for the data moves
//   mask        - opmask register used by the masked tail moves
//   from/to     - source/destination base addresses
//   start_index - index register for the low-end masked copy
//                 (the callers in this file zero it before the call)
//   end_index   - index register for the negative-offset high-end copies
//                 (the callers load it with the element count)
//   count       - element count; decremented as pieces are copied
//   shift       - log2(element size): 0=byte, 1=short, 2=int, 3=long
//   temp        - scratch register used to materialize the copy masks
void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                           Register to, Register start_index, Register end_index,
                                                           Register count, int shift, Register temp,
                                                           bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;
  // True when 64-byte vectors may be used unconditionally (no AVX3 threshold).
  bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);

  // Byte thresholds 32..192 expressed as element counts, indexed by shift.
  int size_mat[][6] = {
  /* T_BYTE */ {32 , 64,  96 , 128 , 160 , 192 },
  /* T_SHORT*/ {16 , 32,  48 , 64  , 80  , 96  },
  /* T_INT  */ {8  , 16,  24 , 32  , 40  , 48  },
  /* T_LONG */ {4  ,  8,  12 , 16  , 20  , 24  }
  };

  // Case A) Special case for length less than equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  if (avx3) {
     // Single 64-byte masked copy covers the whole range.
     copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
  } else {
     // High 32 bytes first, then masked copy of the low remainder.
     copy32_avx(to, from, end_index, xmm, shift, -32);
     __ subq(count, 32 >> shift);
     copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  }
  __ jmp(L_exit);

  // Case C) Special case for length less than equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case D) Special case for length less than equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -96);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case E) Special case for length less than equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case F) Special case for length less than equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);  // > 192 bytes: fall back to the main loop
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -160);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);
}
 931 
 932 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
 933                                        KRegister mask, Register length, Register index,
 934                                        Register temp, int shift, int offset,
 935                                        bool use64byteVector) {
 936   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 937   assert(MaxVectorSize >= 32, "vector length should be >= 32");
 938   if (!use64byteVector) {
 939     copy32_avx(dst, src, index, xmm, shift, offset);
 940     __ subptr(length, 32 >> shift);
 941     copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
 942   } else {
 943     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
 944     assert(MaxVectorSize == 64, "vector length != 64");
 945     __ mov64(temp, -1L);
 946     __ bzhiq(temp, temp, length);
 947     __ kmovql(mask, temp);
 948     __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
 949     __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
 950   }
 951 }
 952 
 953 
 954 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
 955                                        KRegister mask, Register length, Register index,
 956                                        Register temp, int shift, int offset) {
 957   assert(MaxVectorSize >= 32, "vector length should be >= 32");
 958   BasicType type[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
 959   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
 960   __ mov64(temp, -1L);
 961   __ bzhiq(temp, temp, length);
 962   __ kmovql(mask, temp);
 963   __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
 964   __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
 965 }
 966 
 967 
 968 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
 969                                 int shift, int offset) {
 970   assert(MaxVectorSize >= 32, "vector length should be >= 32");
 971   Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
 972   __ vmovdqu(xmm, Address(src, index, scale, offset));
 973   __ vmovdqu(Address(dst, index, scale, offset), xmm);
 974 }
 975 
 976 
 977 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
 978                                 bool conjoint, int shift, int offset, bool use64byteVector) {
 979   assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
 980   if (!use64byteVector) {
 981     if (conjoint) {
 982       copy32_avx(dst, src, index, xmm, shift, offset+32);
 983       copy32_avx(dst, src, index, xmm, shift, offset);
 984     } else {
 985       copy32_avx(dst, src, index, xmm, shift, offset);
 986       copy32_avx(dst, src, index, xmm, shift, offset+32);
 987     }
 988   } else {
 989     Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
 990     __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
 991     __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
 992   }
 993 }
 994 
 995 #endif // COMPILER2_OR_JVMCI
 996 
 997 
 998 // Arguments:
 999 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1000 //             ignored
1001 //   name    - stub name string
1002 //
1003 // Inputs:
1004 //   c_rarg0   - source array address
1005 //   c_rarg1   - destination array address
1006 //   c_rarg2   - element count, treated as ssize_t, can be zero
1007 //
1008 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1009 // we let the hardware handle it.  The one to eight bytes within words,
1010 // dwords or qwords that span cache line boundaries will still be loaded
1011 // and stored atomically.
1012 //
1013 // Side Effects:
1014 //   disjoint_byte_copy_entry is set to the no-overlap entry point
1015 //   used by generate_conjoint_byte_copy().
1016 //
address StubGenerator::generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
#if COMPILER2_OR_JVMCI
  // Prefer the AVX3 masked stub when AVX512VL+BW (masked byte moves) and
  // BMI2 (BZHI, for mask construction) are available.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
     return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
                                               aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  Label L_copy_byte, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count); // make the count negative
    // The bulk loop lives out-of-line (see copy_bytes_forward below) and is
    // entered through L_copy_bytes; it branches back to L_copy_8_bytes.
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // After the qword loop qword_count is zero, so end_from/end_to + 8
    // address the first byte past the last full qword copied; the 4/2/1
    // byte tails below use that fixed displacement.

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
  __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jccb(Assembler::zero, L_copy_byte);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);

    __ addptr(end_from, 2);
    __ addptr(end_to, 2);

    // Check for and copy trailing byte
  __ BIND(L_copy_byte);
    __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movb(rax, Address(end_from, 8));
    __ movb(Address(end_to, 8), rax);
  }
__ BIND(L_exit);
  address ucme_exit_pc = __ pc();  // resume pc handed to the UnsafeCopyMemoryMark below
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  // Out-of-line bulk copy loop, emitted after the epilogue.  It is entered
  // via L_copy_bytes (jumped to above) and returns to the trailing-qword
  // handling through L_copy_8_bytes / L_copy_4_bytes.
  {
    UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    __ jmp(L_copy_4_bytes);
  }
  return start;
}
1116 
1117 
1118 // Arguments:
1119 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1120 //             ignored
1121 //   name    - stub name string
1122 //
1123 // Inputs:
1124 //   c_rarg0   - source array address
1125 //   c_rarg1   - destination array address
1126 //   c_rarg2   - element count, treated as ssize_t, can be zero
1127 //
1128 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1129 // we let the hardware handle it.  The one to eight bytes within words,
1130 // dwords or qwords that span cache line boundaries will still be loaded
1131 // and stored atomically.
1132 //
address StubGenerator::generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                                   address* entry, const char *name) {
#if COMPILER2_OR_JVMCI
  // Prefer the AVX3 masked stub when AVX512VL+BW and BMI2 are available.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
     return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
                                               nooverlap_target, aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // If the ranges do not actually overlap, tail-call the faster disjoint stub.
  array_overlap_test(nooverlap_target, Address::times_1);
  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3);   // count => qword_count

    // Copy from high to low addresses.

    // Check for and copy trailing byte
    __ testl(byte_count, 1);
    __ jcc(Assembler::zero, L_copy_2_bytes);
    __ movb(rax, Address(from, byte_count, Address::times_1, -1));
    __ movb(Address(to, byte_count, Address::times_1, -1), rax);
    __ decrement(byte_count); // Adjust for possible trailing word

    // Check for and copy trailing word
  __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jcc(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, byte_count, Address::times_1, -2));
    __ movw(Address(to, byte_count, Address::times_1, -2), rax);

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    // Enter the out-of-line bulk loop (copy_bytes_backward, emitted below).
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  // Epilogue for the trailing-qword path above (fall-through when
  // qword_count reaches zero).
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  // Out-of-line bulk copy loop, entered via L_copy_bytes; when it finishes
  // it falls through to the second epilogue below.
  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // Copy in multi-bytes chunks
    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
1225 
1226 
1227 // Arguments:
1228 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1229 //             ignored
1230 //   name    - stub name string
1231 //
1232 // Inputs:
1233 //   c_rarg0   - source array address
1234 //   c_rarg1   - destination array address
1235 //   c_rarg2   - element count, treated as ssize_t, can be zero
1236 //
1237 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1238 // let the hardware handle it.  The two or four words within dwords
1239 // or qwords that span cache line boundaries will still be loaded
1240 // and stored atomically.
1241 //
1242 // Side Effects:
1243 //   disjoint_short_copy_entry is set to the no-overlap entry point
1244 //   used by generate_conjoint_short_copy().
1245 //
address StubGenerator::generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
#if COMPILER2_OR_JVMCI
  // Prefer the AVX3 masked stub when AVX512VL+BW and BMI2 are available.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
     return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
                                               aligned, false, false);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register word_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    // The bulk loop lives out-of-line (see copy_bytes_forward below) and is
    // entered through L_copy_bytes; it branches back to L_copy_8_bytes.
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Original 'dest' is trashed, so we can't use it as a
    // base register for a possible trailing word copy

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
  __ BIND(L_copy_2_bytes);
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);
  }
__ BIND(L_exit);
  address ucme_exit_pc = __ pc();  // resume pc handed to the UnsafeCopyMemoryMark below
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  // Out-of-line bulk copy loop, emitted after the epilogue.  It is entered
  // via L_copy_bytes and returns to the tail handling through
  // L_copy_8_bytes / L_copy_4_bytes.
  {
    UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    __ jmp(L_copy_4_bytes);
  }

  return start;
}
1339 
1340 
1341 address StubGenerator::generate_fill(BasicType t, bool aligned, const char *name) {
1342   __ align(CodeEntryAlignment);
1343   StubCodeMark mark(this, "StubRoutines", name);
1344   address start = __ pc();
1345 
1346   BLOCK_COMMENT("Entry:");
1347 
1348   const Register to       = c_rarg0;  // destination array address
1349   const Register value    = c_rarg1;  // value
1350   const Register count    = c_rarg2;  // elements count
1351   __ mov(r11, count);
1352 
1353   __ enter(); // required for proper stackwalking of RuntimeStub frame
1354 
1355   __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1356 
1357   __ vzeroupper();
1358   __ leave(); // required for proper stackwalking of RuntimeStub frame
1359   __ ret(0);
1360 
1361   return start;
1362 }
1363 
1364 
1365 // Arguments:
1366 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1367 //             ignored
1368 //   name    - stub name string
1369 //
1370 // Inputs:
1371 //   c_rarg0   - source array address
1372 //   c_rarg1   - destination array address
1373 //   c_rarg2   - element count, treated as ssize_t, can be zero
1374 //
1375 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1376 // let the hardware handle it.  The two or four words within dwords
1377 // or qwords that span cache line boundaries will still be loaded
1378 // and stored atomically.
1379 //
address StubGenerator::generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                                    address *entry, const char *name) {
#if COMPILER2_OR_JVMCI
  // Prefer the AVX-512 masked-copy stub when the CPU supports AVX512VL/BW + BMI2
  // and the VM permits vectors of at least 32 bytes.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
     return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
                                               nooverlap_target, aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register word_count  = rcx;  // element count in jshorts (copy of 'count' before the shift below)
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // If src/dst don't actually overlap backward, tail-jump to the faster
  // disjoint (forward-copy) stub.
  array_overlap_test(nooverlap_target, Address::times_2);
  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count

    // Copy from high to low addresses.  Use 'to' as scratch.

    // Check for and copy trailing word
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, word_count, Address::times_2, -2));
    __ movw(Address(to, word_count, Address::times_2, -2), rax);

   // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jcc(Assembler::zero, L_copy_bytes);
    // The residual dword (tail words 2 and 3) starts right after the last
    // full qword, i.e. at byte offset qword_count * 8 (no displacement).
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  // Exit path taken when the bulk copy below jumps back with zero qwords left.
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  // Out-of-line bulk-copy loop; it branches back to L_copy_8_bytes /
  // L_copy_bytes above, so the epilogue is duplicated here.
  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // Copy in multi-bytes chunks
    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
1464 
1465 
1466 // Arguments:
1467 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1468 //             ignored
1469 //   is_oop  - true => oop array, so generate store check code
1470 //   name    - stub name string
1471 //
1472 // Inputs:
1473 //   c_rarg0   - source array address
1474 //   c_rarg1   - destination array address
1475 //   c_rarg2   - element count, treated as ssize_t, can be zero
1476 //
1477 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1478 // the hardware handle it.  The two dwords within qwords that span
1479 // cache line boundaries will still be loaded and stored atomically.
1480 //
1481 // Side Effects:
1482 //   disjoint_int_copy_entry is set to the no-overlap entry point
1483 //   used by generate_conjoint_int_oop_copy().
1484 //
address StubGenerator::generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
                                                      const char *name, bool dest_uninitialized) {
#if COMPILER2_OR_JVMCI
  // Prefer the AVX-512 masked-copy stub when the CPU supports AVX512VL/BW + BMI2
  // and the VM permits vectors of at least 32 bytes.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
     return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
                                               aligned, is_oop, dest_uninitialized);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register dword_count = rcx;  // element count in jints (copy of 'count' before the shift below)
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                 // r9 is used to save r15_thread

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  // GC barrier prologue/epilogue (e.g. pre/post write barriers for oop arrays).
  BasicType type = is_oop ? T_OBJECT : T_INT;
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count); // loop counts up from -qword_count to zero
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
    __ jccb(Assembler::zero, L_exit);
    // Residual dword sits just past the inclusive end pointers.
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);
  }
__ BIND(L_exit);
  // Record the pc where an Unsafe copy fault should resume (the epilogue).
  address ucme_exit_pc = __ pc();
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ vzeroupper();
  __ xorptr(rax, rax); // return 0
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  // Out-of-line bulk-copy loop; on a fault it continues at ucme_exit_pc above.
  {
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    __ jmp(L_copy_4_bytes);
  }

  return start;
}
1579 
1580 
1581 // Arguments:
1582 //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1583 //             ignored
1584 //   is_oop  - true => oop array, so generate store check code
1585 //   name    - stub name string
1586 //
1587 // Inputs:
1588 //   c_rarg0   - source array address
1589 //   c_rarg1   - destination array address
1590 //   c_rarg2   - element count, treated as ssize_t, can be zero
1591 //
1592 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1593 // the hardware handle it.  The two dwords within qwords that span
1594 // cache line boundaries will still be loaded and stored atomically.
1595 //
address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
                                                      address *entry, const char *name,
                                                      bool dest_uninitialized) {
#if COMPILER2_OR_JVMCI
  // Prefer the AVX-512 masked-copy stub when the CPU supports AVX512VL/BW + BMI2
  // and the VM permits vectors of at least 32 bytes.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
     return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
                                               nooverlap_target, aligned, is_oop, dest_uninitialized);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register dword_count = rcx;  // element count in jints (copy of 'count' before the shift below)
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // If src/dst don't actually overlap backward, tail-jump to the faster
  // disjoint (forward-copy) stub.
  array_overlap_test(nooverlap_target, Address::times_4);
  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                 // r9 is used to save r15_thread

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  // GC barrier prologue/epilogue (e.g. pre/post write barriers for oop arrays).
  BasicType type = is_oop ? T_OBJECT : T_INT;
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // no registers are destroyed by this call
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  assert_clean_int(count, rax); // Make sure 'count' is clean int.
  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count

    // Copy from high to low addresses.  Use 'to' as scratch.

    // Check for and copy trailing dword
    __ testl(dword_count, 1);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, dword_count, Address::times_4, -4));
    __ movl(Address(to, dword_count, Address::times_4, -4), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  if (is_oop) {
    // oop copies must run the barrier epilogue at L_exit below.
    __ jmp(L_exit);
  }
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  // Out-of-line bulk-copy loop; it branches back to the labels above.
  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // Copy in multi-bytes chunks
    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
  }

__ BIND(L_exit);
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
1694 
1695 
1696 // Arguments:
1697 //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1698 //             ignored
1699 //   is_oop  - true => oop array, so generate store check code
1700 //   name    - stub name string
1701 //
1702 // Inputs:
1703 //   c_rarg0   - source array address
1704 //   c_rarg1   - destination array address
1705 //   c_rarg2   - element count, treated as ssize_t, can be zero
1706 //
// Side Effects:
1708 //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1709 //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1710 //
1711 address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
1712                                                        const char *name, bool dest_uninitialized) {
1713 #if COMPILER2_OR_JVMCI
1714   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1715      return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
1716                                                aligned, is_oop, dest_uninitialized);
1717   }
1718 #endif
1719   __ align(CodeEntryAlignment);
1720   StubCodeMark mark(this, "StubRoutines", name);
1721   address start = __ pc();
1722 
1723   Label L_copy_bytes, L_copy_8_bytes, L_exit;
1724   const Register from        = rdi;  // source array address
1725   const Register to          = rsi;  // destination array address
1726   const Register qword_count = rdx;  // elements count
1727   const Register end_from    = from; // source array end address
1728   const Register end_to      = rcx;  // destination array end address
1729   const Register saved_count = r11;
1730   // End pointers are inclusive, and if count is not zero they point
1731   // to the last unit copied:  end_to[0] := end_from[0]
1732 
1733   __ enter(); // required for proper stackwalking of RuntimeStub frame
1734   // Save no-overlap entry point for generate_conjoint_long_oop_copy()
1735   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1736 
1737   if (entry != NULL) {
1738     *entry = __ pc();
1739     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1740     BLOCK_COMMENT("Entry:");
1741   }
1742 
1743   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1744                                    // r9 is used to save r15_thread
1745   // 'from', 'to' and 'qword_count' are now valid
1746 
1747   DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1748   if (dest_uninitialized) {
1749     decorators |= IS_DEST_UNINITIALIZED;
1750   }
1751   if (aligned) {
1752     decorators |= ARRAYCOPY_ALIGNED;
1753   }
1754 
1755   BasicType type = is_oop ? T_OBJECT : T_LONG;
1756   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1757   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
1758   {
1759     // UnsafeCopyMemory page error: continue after ucm
1760     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1761 
1762     // Copy from low to high addresses.  Use 'to' as scratch.
1763     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1764     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1765     __ negptr(qword_count);
1766     __ jmp(L_copy_bytes);
1767 
1768     // Copy trailing qwords
1769   __ BIND(L_copy_8_bytes);
1770     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1771     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1772     __ increment(qword_count);
1773     __ jcc(Assembler::notZero, L_copy_8_bytes);
1774   }
1775   if (is_oop) {
1776     __ jmp(L_exit);
1777   } else {
1778     restore_arg_regs_using_thread();
1779     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1780     __ xorptr(rax, rax); // return 0
1781     __ vzeroupper();
1782     __ leave(); // required for proper stackwalking of RuntimeStub frame
1783     __ ret(0);
1784   }
1785 
1786   {
1787     // UnsafeCopyMemory page error: continue after ucm
1788     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1789     // Copy in multi-bytes chunks
1790     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1791   }
1792 
1793   __ BIND(L_exit);
1794   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
1795   restore_arg_regs_using_thread();
1796   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
1797                           SharedRuntime::_jlong_array_copy_ctr,
1798                  rscratch1); // Update counter after rscratch1 is free
1799   __ vzeroupper();
1800   __ xorptr(rax, rax); // return 0
1801   __ leave(); // required for proper stackwalking of RuntimeStub frame
1802   __ ret(0);
1803 
1804   return start;
1805 }
1806 
1807 
1808 // Arguments:
1809 //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1810 //             ignored
1811 //   is_oop  - true => oop array, so generate store check code
1812 //   name    - stub name string
1813 //
1814 // Inputs:
1815 //   c_rarg0   - source array address
1816 //   c_rarg1   - destination array address
1817 //   c_rarg2   - element count, treated as ssize_t, can be zero
1818 //
1819 address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1820                                                        address *entry, const char *name,
1821                                                        bool dest_uninitialized) {
1822 #if COMPILER2_OR_JVMCI
1823   if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1824      return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
1825                                                nooverlap_target, aligned, is_oop, dest_uninitialized);
1826   }
1827 #endif
1828   __ align(CodeEntryAlignment);
1829   StubCodeMark mark(this, "StubRoutines", name);
1830   address start = __ pc();
1831 
1832   Label L_copy_bytes, L_copy_8_bytes, L_exit;
1833   const Register from        = rdi;  // source array address
1834   const Register to          = rsi;  // destination array address
1835   const Register qword_count = rdx;  // elements count
1836   const Register saved_count = rcx;
1837 
1838   __ enter(); // required for proper stackwalking of RuntimeStub frame
1839   assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1840 
1841   if (entry != NULL) {
1842     *entry = __ pc();
1843     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1844     BLOCK_COMMENT("Entry:");
1845   }
1846 
1847   array_overlap_test(nooverlap_target, Address::times_8);
1848   setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1849                                  // r9 is used to save r15_thread
1850   // 'from', 'to' and 'qword_count' are now valid
1851 
1852   DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1853   if (dest_uninitialized) {
1854     decorators |= IS_DEST_UNINITIALIZED;
1855   }
1856   if (aligned) {
1857     decorators |= ARRAYCOPY_ALIGNED;
1858   }
1859 
1860   BasicType type = is_oop ? T_OBJECT : T_LONG;
1861   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1862   bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
1863   {
1864     // UnsafeCopyMemory page error: continue after ucm
1865     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1866 
1867     __ jmp(L_copy_bytes);
1868 
1869     // Copy trailing qwords
1870   __ BIND(L_copy_8_bytes);
1871     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1872     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1873     __ decrement(qword_count);
1874     __ jcc(Assembler::notZero, L_copy_8_bytes);
1875   }
1876   if (is_oop) {
1877     __ jmp(L_exit);
1878   } else {
1879     restore_arg_regs_using_thread();
1880     INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1881     __ xorptr(rax, rax); // return 0
1882     __ vzeroupper();
1883     __ leave(); // required for proper stackwalking of RuntimeStub frame
1884     __ ret(0);
1885   }
1886   {
1887     // UnsafeCopyMemory page error: continue after ucm
1888     UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1889 
1890     // Copy in multi-bytes chunks
1891     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1892   }
1893   __ BIND(L_exit);
1894   bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
1895   restore_arg_regs_using_thread();
1896   INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
1897                           SharedRuntime::_jlong_array_copy_ctr,
1898                  rscratch1); // Update counter after rscratch1 is free
1899   __ vzeroupper();
1900   __ xorptr(rax, rax); // return 0
1901   __ leave(); // required for proper stackwalking of RuntimeStub frame
1902   __ ret(0);
1903 
1904   return start;
1905 }
1906 
1907 
1908 // Helper for generating a dynamic type check.
1909 // Smashes no registers.
void StubGenerator::generate_type_check(Register sub_klass,
                                        Register super_check_offset,
                                        Register super_klass,
                                        Label& L_success) {
  assert_different_registers(sub_klass, super_check_offset, super_klass);

  BLOCK_COMMENT("type_check:");

  Label L_miss;

  // Fast path: uses the precomputed super_check_offset; jumps to L_success
  // on a hit, to L_miss when the slow path must decide.
  __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                   super_check_offset);
  // Slow path: full subtype walk; jumps to L_success on a hit, otherwise
  // falls through (failure).
  __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

  // Fall through on failure!
  __ BIND(L_miss);
}
1927 
1928 //
1929 //  Generate checkcasting array copy stub
1930 //
1931 //  Input:
1932 //    c_rarg0   - source array address
1933 //    c_rarg1   - destination array address
1934 //    c_rarg2   - element count, treated as ssize_t, can be zero
1935 //    c_rarg3   - size_t ckoff (super_check_offset)
1936 // not Win64
1937 //    c_rarg4   - oop ckval (super_klass)
1938 // Win64
1939 //    rsp+40    - oop ckval (super_klass)
1940 //
1941 //  Output:
1942 //    rax ==  0  -  success
1943 //    rax == -1^K - failure, where K is partial transfer count
1944 //
address StubGenerator::generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized) {

  Label L_load_element, L_store_element, L_do_card_marks, L_done;

  // Input registers (after setup_arg_regs)
  const Register from        = rdi;   // source array address
  const Register to          = rsi;   // destination array address
  const Register length      = rdx;   // elements count
  const Register ckoff       = rcx;   // super_check_offset
  const Register ckval       = r8;    // super_klass

  // Registers used as temps (r13, r14 are save-on-entry)
  const Register end_from    = from;  // source array end address
  const Register end_to      = r13;   // destination array end address
  const Register count       = rdx;   // -(count_remaining)
  const Register r14_length  = r14;   // saved copy of length
  // End pointers are inclusive, and if length is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  const Register rax_oop    = rax;    // actual oop copied
  const Register r11_klass  = r11;    // oop._klass

  //---------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the two arrays are subtypes of Object[] but the
  // destination array type is not equal to or a supertype
  // of the source type.  Each element must be separately
  // checked.

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
  // caller guarantees that the arrays really are different
  // otherwise, we would have to make conjoint checks
  { Label L;
    array_overlap_test(L, TIMES_OOP);
    __ stop("checkcast_copy within a single array");
    __ bind(L);
  }
#endif //ASSERT

  setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
                     // ckoff => rcx, ckval => r8
                     // r9 and r10 may be used to save non-volatile registers
#ifdef _WIN64
  // last argument (#4) is on stack on Win64
  // At entry ckval was at rsp+40 (see header comment); after enter() pushed
  // rbp it sits 6 slots up: return address + saved rbp + 4 shadow words.
  __ movptr(ckval, Address(rsp, 6 * wordSize));
#endif

  // Caller of this entry point must set up the argument registers.
  if (entry != NULL) {
    *entry = __ pc();
    BLOCK_COMMENT("Entry:");
  }

  // allocate spill slots for r13, r14
  // (enum members double as slot indices; saved_rbp_offset is the slot count)
  enum {
    saved_r13_offset,
    saved_r14_offset,
    saved_r10_offset,
    saved_rbp_offset
  };
  __ subptr(rsp, saved_rbp_offset * wordSize);
  __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
  __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
  __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);

#ifdef ASSERT
    // Verify that r15 still holds the current thread.
    Label L2;
    __ get_thread(r14);
    __ cmpptr(r15_thread, r14);
    __ jcc(Assembler::equal, L2);
    __ stop("StubRoutines::call_stub: r15_thread is modified by call");
    __ bind(L2);
#endif // ASSERT

  // check that int operands are properly extended to size_t
  assert_clean_int(length, rax);
  assert_clean_int(ckoff, rax);

#ifdef ASSERT
  BLOCK_COMMENT("assert consistent ckoff/ckval");
  // The ckoff and ckval must be mutually consistent,
  // even though caller generates both.
  { Label L;
    int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ cmpl(ckoff, Address(ckval, sco_offset));
    __ jcc(Assembler::equal, L);
    __ stop("super_check_offset inconsistent");
    __ bind(L);
  }
#endif //ASSERT

  // Loop-invariant addresses.  They are exclusive end pointers.
  Address end_from_addr(from, length, TIMES_OOP, 0);
  Address   end_to_addr(to,   length, TIMES_OOP, 0);
  // Loop-variant addresses.  They assume post-incremented count < 0.
  Address from_element_addr(end_from, count, TIMES_OOP, 0);
  Address   to_element_addr(end_to,   count, TIMES_OOP, 0);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }

  // GC barrier prologue/epilogue bracket the copy loop.
  BasicType type = T_OBJECT;
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  // Copy from low to high addresses, indexed from the end of each array.
  __ lea(end_from, end_from_addr);
  __ lea(end_to,   end_to_addr);
  __ movptr(r14_length, length);        // save a copy of the length
  assert(length == count, "");          // else fix next line:
  __ negptr(count);                     // negate and test the length
  __ jcc(Assembler::notZero, L_load_element);

  // Empty array:  Nothing to do.
  __ xorptr(rax, rax);                  // return 0 on (trivial) success
  __ jmp(L_done);

  // ======== begin loop ========
  // (Loop is rotated; its entry is L_load_element.)
  // Loop control:
  //   for (count = -count; count != 0; count++)
  // Base pointers src, dst are biased by 8*(count-1),to last element.
  __ align(OptoLoopAlignment);

  __ BIND(L_store_element);
  __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
  __ increment(count);               // increment the count toward zero
  __ jcc(Assembler::zero, L_do_card_marks);

  // ======== loop entry is here ========
  __ BIND(L_load_element);
  __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
  __ testptr(rax_oop, rax_oop);
  __ jcc(Assembler::zero, L_store_element); // null elements need no type check

  __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
  generate_type_check(r11_klass, ckoff, ckval, L_store_element);
  // ======== end loop ========

  // It was a real error; we must depend on the caller to finish the job.
  // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
  // Emit GC store barriers for the oops we have copied (r14 + rdx),
  // and report their number to the caller.
  assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
  Label L_post_barrier;
  __ addptr(r14_length, count);     // K = (original - remaining) oops
  __ movptr(rax, r14_length);       // save the value
  __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
  __ jccb(Assembler::notZero, L_post_barrier);
  __ jmp(L_done); // K == 0, nothing was copied, skip post barrier

  // Come here on success only.
  __ BIND(L_do_card_marks);
  __ xorptr(rax, rax);              // return 0 on success

  __ BIND(L_post_barrier);
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);

  // Common exit point (success or failure).
  __ BIND(L_done);
  __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
  __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
  __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
2123 
2124 
2125 //  Generate 'unsafe' array copy stub
2126 //  Though just as safe as the other stubs, it takes an unscaled
2127 //  size_t argument instead of an element count.
2128 //
2129 //  Input:
2130 //    c_rarg0   - source array address
2131 //    c_rarg1   - destination array address
2132 //    c_rarg2   - byte count, treated as ssize_t, can be zero
2133 //
2134 // Examines the alignment of the operands and dispatches
2135 // to a long, int, short, or byte copy loop.
2136 //
address StubGenerator::generate_unsafe_copy(const char *name,
                                            address byte_copy_entry, address short_copy_entry,
                                            address int_copy_entry, address long_copy_entry) {

  Label L_long_aligned, L_int_aligned, L_short_aligned;

  // Input registers (before setup_arg_regs)
  const Register from        = c_rarg0;  // source array address
  const Register to          = c_rarg1;  // destination array address
  const Register size        = c_rarg2;  // byte count (size_t)

  // Register used as a temp
  const Register bits        = rax;      // test copy of low bits

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);

  // OR the two addresses and the size together: the combined low bits tell
  // the coarsest element size all three are aligned to.
  __ mov(bits, from);
  __ orptr(bits, to);
  __ orptr(bits, size);

  __ testb(bits, BytesPerLong-1);
  __ jccb(Assembler::zero, L_long_aligned);

  __ testb(bits, BytesPerInt-1);
  __ jccb(Assembler::zero, L_int_aligned);

  // Not even 2-byte aligned: fall back to the byte copy stub.
  __ testb(bits, BytesPerShort-1);
  __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));

  // Each case converts the byte count to an element count, then tail-jumps
  // to the matching element-copy stub (which returns to our caller).
  __ BIND(L_short_aligned);
  __ shrptr(size, LogBytesPerShort); // size => short_count
  __ jump(RuntimeAddress(short_copy_entry));

  __ BIND(L_int_aligned);
  __ shrptr(size, LogBytesPerInt); // size => int_count
  __ jump(RuntimeAddress(int_copy_entry));

  __ BIND(L_long_aligned);
  __ shrptr(size, LogBytesPerLong); // size => qword_count
  __ jump(RuntimeAddress(long_copy_entry));

  return start;
}
2187 
2188 
// Perform range checks on the proposed arraycopy:
//   (7) src_pos + length must not exceed length of src
//   (8) dst_pos + length must not exceed length of dst
// and branch to L_failed on violation.
// Kills temp, but nothing else.
// Also, clean the sign bits of src_pos and dst_pos.
void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                                           Register src_pos, // source position (c_rarg1)
                                           Register dst,     // destination array oop (c_rarg2)
                                           Register dst_pos, // destination position (c_rarg3)
                                           Register length,  // element count (32-bit, non-negative)
                                           Register temp,    // scratch, clobbered
                                           Label& L_failed) {
  BLOCK_COMMENT("arraycopy_range_checks:");

  //  if (src_pos + length > arrayOop(src)->length())  FAIL;
  __ movl(temp, length);
  __ addl(temp, src_pos);             // src_pos + length
  // Unsigned compare: the 32-bit sum of two non-negative 32-bit values
  // cannot wrap past the unsigned range, so 'above' is the right condition.
  __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed);

  //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
  __ movl(temp, length);
  __ addl(temp, dst_pos);             // dst_pos + length
  __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed);

  // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  // Move with sign extension can be used since they are positive.
  __ movslq(src_pos, src_pos);
  __ movslq(dst_pos, dst_pos);

  BLOCK_COMMENT("arraycopy_range_checks done");
}
2220 
2221 
2222 //  Generate generic array copy stubs
2223 //
2224 //  Input:
2225 //    c_rarg0    -  src oop
2226 //    c_rarg1    -  src_pos (32-bits)
2227 //    c_rarg2    -  dst oop
2228 //    c_rarg3    -  dst_pos (32-bits)
2229 // not Win64
2230 //    c_rarg4    -  element count (32-bits)
2231 // Win64
2232 //    rsp+40     -  element count (32-bits)
2233 //
2234 //  Output:
2235 //    rax ==  0  -  success
2236 //    rax == -1^K - failure, where K is partial transfer count
2237 //
address StubGenerator::generate_generic_copy(const char *name,
                                             address byte_copy_entry, address short_copy_entry,
                                             address int_copy_entry, address oop_copy_entry,
                                             address long_copy_entry, address checkcast_copy_entry) {

  Label L_failed, L_failed_0, L_objArray;
  Label L_copy_shorts, L_copy_ints, L_copy_longs;

  // Input registers
  const Register src        = c_rarg0;  // source array oop
  const Register src_pos    = c_rarg1;  // source position
  const Register dst        = c_rarg2;  // destination array oop
  const Register dst_pos    = c_rarg3;  // destination position
#ifndef _WIN64
  const Register length     = c_rarg4;
  const Register rklass_tmp = r9;  // load_klass
#else
  // Elements count is on stack on Win64: 7 slots = 4 home-space slots +
  // return address + rbp pushed by enter() + rdi pushed below.
  const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
  const Register rklass_tmp = rdi;  // load_klass
#endif

  // Pad with nops so that the jmp(L_failed) emitted below ends exactly at a
  // CodeEntryAlignment boundary; the real stub entry point then starts
  // aligned immediately after this short-hop failure trampoline.
  { int modulus = CodeEntryAlignment;
    int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
    int advance = target - (__ offset() % modulus);
    if (advance < 0)  advance += modulus;
    if (advance > 0)  __ nop(advance);
  }
  StubCodeMark mark(this, "StubRoutines", name);

  // Short-hop target to L_failed.  Makes for denser prologue code.
  __ BIND(L_failed_0);
  __ jmp(L_failed);
  assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

  __ align(CodeEntryAlignment);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
  __ push(rklass_tmp); // rdi is callee-save on Windows
#endif

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);

  //-----------------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the following conditions are met:
  //
  // (1) src and dst must not be null.
  // (2) src_pos must not be negative.
  // (3) dst_pos must not be negative.
  // (4) length  must not be negative.
  // (5) src klass and dst klass should be the same and not NULL.
  // (6) src and dst should be arrays.
  // (7) src_pos + length must not exceed length of src.
  // (8) dst_pos + length must not exceed length of dst.
  //

  //  if (src == NULL) return -1;
  __ testptr(src, src);         // src oop
  size_t j1off = __ offset();
  __ jccb(Assembler::zero, L_failed_0);

  //  if (src_pos < 0) return -1;
  __ testl(src_pos, src_pos); // src_pos (32-bits)
  __ jccb(Assembler::negative, L_failed_0);

  //  if (dst == NULL) return -1;
  __ testptr(dst, dst);         // dst oop
  __ jccb(Assembler::zero, L_failed_0);

  //  if (dst_pos < 0) return -1;
  __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  size_t j4off = __ offset();
  __ jccb(Assembler::negative, L_failed_0);

  // The first four tests are very dense code,
  // but not quite dense enough to put four
  // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predictors
  // do not like jumps so close together.
  // Make sure of this.
  guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");

  // registers used as temp
  const Register r11_length    = r11; // elements count to copy
  const Register r10_src_klass = r10; // array klass

  //  if (length < 0) return -1;
  __ movl(r11_length, length);        // length (elements count, 32-bits value)
  __ testl(r11_length, r11_length);
  __ jccb(Assembler::negative, L_failed_0);

  __ load_klass(r10_src_klass, src, rklass_tmp);
#ifdef ASSERT
  //  assert(src->klass() != NULL);
  {
    BLOCK_COMMENT("assert klasses not null {");
    Label L1, L2;
    __ testptr(r10_src_klass, r10_src_klass);
    __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
    __ bind(L1);
    __ stop("broken null klass");
    __ bind(L2);
    __ load_klass(rax, dst, rklass_tmp);
    __ cmpq(rax, 0);
    __ jcc(Assembler::equal, L1);     // this would be broken also
    BLOCK_COMMENT("} assert klasses not null done");
  }
#endif

  // Load layout helper (32-bits)
  //
  //  |array_tag|     | header_size | element_type |     |log2_element_size|
  // 32        30    24            16              8     2                 0
  //
  //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  //

  const int lh_offset = in_bytes(Klass::layout_helper_offset());

  // Handle objArrays completely differently...
  const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
  __ jcc(Assembler::equal, L_objArray);

  //  if (src->klass() != dst->klass()) return -1;
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax);
  __ jcc(Assembler::notEqual, L_failed);

  const Register rax_lh = rax;  // layout helper
  __ movl(rax_lh, Address(r10_src_klass, lh_offset));

  // Check for flat inline type array -> return -1
  __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace);
  __ jcc(Assembler::notZero, L_failed);

  // Check for null-free (non-flat) inline type array -> handle as object array
  __ testl(rax_lh, Klass::_lh_null_free_array_bit_inplace);
  __ jcc(Assembler::notZero, L_objArray);

  //  if (!src->is_Array()) return -1;
  // Non-array klasses have a layout helper >= _lh_neutral_value.
  __ cmpl(rax_lh, Klass::_lh_neutral_value);
  __ jcc(Assembler::greaterEqual, L_failed);

  // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert primitive array {");
    Label L;
    __ movl(rklass_tmp, rax_lh);
    __ sarl(rklass_tmp, Klass::_lh_array_tag_shift);
    __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value);
    __ jcc(Assembler::equal, L);
    __ stop("must be a primitive array");
    __ bind(L);
    BLOCK_COMMENT("} assert primitive array done");
  }
#endif

  // Checks (7) and (8); also sign-cleans src_pos/dst_pos (kills r10).
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  // TypeArrayKlass
  //
  // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  //

  const Register r10_offset = r10;    // array offset
  const Register rax_elsize = rax_lh; // element size

  // Extract the header size (array base offset) from the layout helper
  // and advance both oops to their element base.
  __ movl(r10_offset, rax_lh);
  __ shrl(r10_offset, Klass::_lh_header_size_shift);
  __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
  __ addptr(src, r10_offset);           // src array offset
  __ addptr(dst, r10_offset);           // dst array offset
  BLOCK_COMMENT("choose copy loop based on element size");
  __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize

#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif

  // next registers should be set before the jump to corresponding stub
  const Register from     = c_rarg0;  // source array address
  const Register to       = c_rarg1;  // destination array address
  const Register count    = c_rarg2;  // elements count

  // 'from', 'to', 'count' registers should be set in such order
  // since they are the same as 'src', 'src_pos', 'dst'.

  __ cmpl(rax_elsize, 0); // log2 element size 0 => byte array
  __ jccb(Assembler::notEqual, L_copy_shorts);
  __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(byte_copy_entry));

__ BIND(L_copy_shorts);
  __ cmpl(rax_elsize, LogBytesPerShort);
  __ jccb(Assembler::notEqual, L_copy_ints);
  __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(short_copy_entry));

__ BIND(L_copy_ints);
  __ cmpl(rax_elsize, LogBytesPerInt);
  __ jccb(Assembler::notEqual, L_copy_longs);
  __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(int_copy_entry));

__ BIND(L_copy_longs);
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert long copy {");
    Label L;
    __ cmpl(rax_elsize, LogBytesPerLong);
    __ jcc(Assembler::equal, L);
    __ stop("must be long copy, but elsize is wrong");
    __ bind(L);
    BLOCK_COMMENT("} assert long copy done");
  }
#endif
  __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(long_copy_entry));

  // ObjArrayKlass
__ BIND(L_objArray);
  // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]

  Label L_plain_copy, L_checkcast_copy;
  //  test array classes for subtyping
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax); // usual case is exact equality
  __ jcc(Assembler::notEqual, L_checkcast_copy);

  // Identically typed arrays can be copied without element-wise checks.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  __ lea(from, Address(src, src_pos, TIMES_OOP,
               arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
               arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  __ movl2ptr(count, r11_length); // length
__ BIND(L_plain_copy);
#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif
  __ jump(RuntimeAddress(oop_copy_entry));

__ BIND(L_checkcast_copy);
  // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
  {
    // Before looking at dst.length, make sure dst is also an objArray.
    // This check also fails for flat/null-free arrays which are not supported.
    __ cmpl(Address(rax, lh_offset), objArray_lh);
    __ jcc(Assembler::notEqual, L_failed);

#ifdef ASSERT
    {
      BLOCK_COMMENT("assert not null-free array {");
      Label L;
      __ movl(rklass_tmp, Address(rax, lh_offset));
      __ testl(rklass_tmp, Klass::_lh_null_free_array_bit_inplace);
      __ jcc(Assembler::zero, L);
      __ stop("unexpected null-free array");
      __ bind(L);
      BLOCK_COMMENT("} assert not null-free array");
    }
#endif

    // It is safe to examine both src.length and dst.length.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           rax, L_failed);

    const Register r11_dst_klass = r11;
    __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload

    // Marshal the base address arguments now, freeing registers.
    __ lea(from, Address(src, src_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ movl(count, length);           // length (reloaded)
    Register sco_temp = c_rarg3;      // this register is free now
    assert_different_registers(from, to, count, sco_temp,
                               r11_dst_klass, r10_src_klass);
    assert_clean_int(count, sco_temp);

    // Generate the type check.
    // If src klass is a subtype of dst klass (per dst's super_check_offset),
    // no per-element checks are needed: jump to the plain oop copy.
    const int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);
    generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);

    // Fetch destination element klass from the ObjArrayKlass header.
    int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
    __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
    __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);

#ifdef _WIN64
    __ pop(rklass_tmp); // Restore callee-save rdi
#endif

    // the checkcast_copy loop needs two extra arguments:
    assert(c_rarg3 == sco_temp, "#3 already in place");
    // Set up arguments for checkcast_copy_entry.
    setup_arg_regs(4);
    __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
    __ jump(RuntimeAddress(checkcast_copy_entry));
  }

__ BIND(L_failed);
#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif
  __ xorptr(rax, rax);
  __ notptr(rax); // return -1
  __ leave();   // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
2572 
2573 #undef __