1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2021, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/universe.hpp"
  35 #include "nativeInst_riscv.hpp"
  36 #include "oops/instanceOop.hpp"
  37 #include "oops/method.hpp"
  38 #include "oops/objArrayKlass.hpp"
  39 #include "oops/oop.inline.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "runtime/frame.inline.hpp"
  42 #include "runtime/handles.inline.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubCodeGenerator.hpp"
  45 #include "runtime/stubRoutines.hpp"
  46 #include "runtime/thread.inline.hpp"
  47 #include "utilities/align.hpp"
  48 #include "utilities/powerOfTwo.hpp"
  49 #ifdef COMPILER2
  50 #include "opto/runtime.hpp"
  51 #endif
  52 #if INCLUDE_ZGC
  53 #include "gc/z/zThreadLocalData.hpp"
  54 #endif
  55 
  56 // Declaration and definition of StubGenerator (no .hpp file).
  57 // For a more detailed description of the stub routine structure
  58 // see the comment in stubRoutines.hpp
  59 
  60 #undef __
  61 #define __ _masm->
  62 
  63 #ifdef PRODUCT
  64 #define BLOCK_COMMENT(str) /* nothing */
  65 #else
  66 #define BLOCK_COMMENT(str) __ block_comment(str)
  67 #endif
  68 
  69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  70 
  71 // Stub Code definitions
  72 
  73 class StubGenerator: public StubCodeGenerator {
  74  private:
  75 
  76 #ifdef PRODUCT
  77 #define inc_counter_np(counter) ((void)0)
  78 #else
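        // Bump a 32-bit counter in memory: a plain, non-atomic
        // read-modify-write, used only for statistics in non-product builds.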
  79   void inc_counter_np_(int& counter) {
  80     __ la(t1, ExternalAddress((address)&counter));
  81     __ lwu(t0, Address(t1, 0));
  82     __ addiw(t0, t0, 1);
  83     __ sw(t0, Address(t1, 0));
  84   }
  85 #define inc_counter_np(counter) \
  86   BLOCK_COMMENT("inc_counter " #counter); \
  87   inc_counter_np_(counter);
  88 #endif
  89 
  90   // Call stubs are used to call Java from C
  91   //
  92   // Arguments:
  93   //    c_rarg0:   call wrapper address                   address
  94   //    c_rarg1:   result                                 address
  95   //    c_rarg2:   result type                            BasicType
  96   //    c_rarg3:   method                                 Method*
  97   //    c_rarg4:   (interpreter) entry point              address
  98   //    c_rarg5:   parameters                             intptr_t*
  99   //    c_rarg6:   parameter size (in words)              int
 100   //    c_rarg7:   thread                                 Thread*
 101   //
  102   // There is no return value from the stub itself; any Java result
  103   // is written through the result pointer
 104   //
  105   // we save x1 (lr) as the return PC at the base of the frame and
  106   // link x8 (fp) below it as the frame pointer, installing sp (x2)
  107   // into fp.
 108   //
 109   // we save x10-x17, which accounts for all the c arguments.
 110   //
 111   // TODO: strictly do we need to save them all? they are treated as
 112   // volatile by C so could we omit saving the ones we are going to
 113   // place in global registers (thread? method?) or those we only use
 114   // during setup of the Java call?
 115   //
 116   // we don't need to save x5 which C uses as an indirect result location
 117   // return register.
 118   //
 119   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 120   // volatile
 121   //
 122   // we save x18-x27 which Java uses as temporary registers and C
 123   // expects to be callee-save
 124   //
 125   // so the stub frame looks like this when we enter Java code
 126   //
 127   //     [ return_from_Java     ] <--- sp
 128   //     [ argument word n      ]
 129   //      ...
 130   // -21 [ argument word 1      ]
 131   // -20 [ saved x27            ] <--- sp_after_call
 132   // -19 [ saved x26            ]
 133   // -18 [ saved x25            ]
 134   // -17 [ saved x24            ]
 135   // -16 [ saved x23            ]
 136   // -15 [ saved x22            ]
 137   // -14 [ saved x21            ]
 138   // -13 [ saved x20            ]
 139   // -12 [ saved x19            ]
 140   // -11 [ saved x18            ]
 141   // -10 [ saved x9             ]
 142   //  -9 [ thread pointer (x4)  ]
 143   //  -8 [ call wrapper   (x10) ]
 144   //  -7 [ result         (x11) ]
 145   //  -6 [ result type    (x12) ]
 146   //  -5 [ method         (x13) ]
 147   //  -4 [ entry point    (x14) ]
 148   //  -3 [ parameters     (x15) ]
 149   //  -2 [ parameter size (x16) ]
 150   //  -1 [ thread         (x17) ]
 151   //   0 [ saved fp       (x8)  ] <--- fp == saved sp (x2)
 152   //   1 [ saved lr       (x1)  ]
 153 
 154   // Call stub stack layout word offsets from fp
 155   enum call_stub_layout {
 156     sp_after_call_off  = -20,
 157 
 158     x27_off            = -20,
 159     x26_off            = -19,
 160     x25_off            = -18,
 161     x24_off            = -17,
 162     x23_off            = -16,
 163     x22_off            = -15,
 164     x21_off            = -14,
 165     x20_off            = -13,
 166     x19_off            = -12,
 167     x18_off            = -11,
 168     x9_off             = -10,
 169 
 170     x4_off             =  -9,
 171 
 172     call_wrapper_off   =  -8,
 173     result_off         =  -7,
 174     result_type_off    =  -6,
 175     method_off         =  -5,
 176     entry_point_off    =  -4,
 177     parameters_off     =  -3,
 178     parameter_size_off =  -2,
 179     thread_off         =  -1,
 180     fp_f               =   0,
 181     retaddr_off        =   1,
 182   };
 183 
 184   address generate_call_stub(address& return_address) {
 185     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 186            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 187            "adjust this code");
 188 
 189     StubCodeMark mark(this, "StubRoutines", "call_stub");
 190     address start = __ pc();
 191 
 192     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 193 
 194     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 195     const Address result        (fp, result_off         * wordSize);
 196     const Address result_type   (fp, result_type_off    * wordSize);
 197     const Address method        (fp, method_off         * wordSize);
 198     const Address entry_point   (fp, entry_point_off    * wordSize);
 199     const Address parameters    (fp, parameters_off     * wordSize);
 200     const Address parameter_size(fp, parameter_size_off * wordSize);
 201 
 202     const Address thread        (fp, thread_off         * wordSize);
 203 
 204     const Address x27_save      (fp, x27_off            * wordSize);
 205     const Address x26_save      (fp, x26_off            * wordSize);
 206     const Address x25_save      (fp, x25_off            * wordSize);
 207     const Address x24_save      (fp, x24_off            * wordSize);
 208     const Address x23_save      (fp, x23_off            * wordSize);
 209     const Address x22_save      (fp, x22_off            * wordSize);
 210     const Address x21_save      (fp, x21_off            * wordSize);
 211     const Address x20_save      (fp, x20_off            * wordSize);
 212     const Address x19_save      (fp, x19_off            * wordSize);
 213     const Address x18_save      (fp, x18_off            * wordSize);
 214 
 215     const Address x9_save       (fp, x9_off             * wordSize);
 216     const Address x4_save       (fp, x4_off             * wordSize);
 217 
 218     // stub code
 219 
 220     address riscv64_entry = __ pc();
 221 
 222     // set up frame and move sp to end of save area
 223     __ enter();
 224     __ addi(sp, fp, sp_after_call_off * wordSize);
 225 
 226     // save register parameters and Java temporary/global registers
 227     // n.b. we save thread even though it gets installed in
  228     // xthread because we want to sanity check it later
 229     __ sd(c_rarg7, thread);
 230     __ sw(c_rarg6, parameter_size);
 231     __ sd(c_rarg5, parameters);
 232     __ sd(c_rarg4, entry_point);
 233     __ sd(c_rarg3, method);
 234     __ sd(c_rarg2, result_type);
 235     __ sd(c_rarg1, result);
 236     __ sd(c_rarg0, call_wrapper);
 237 
 238     __ sd(x4, x4_save);
 239     __ sd(x9, x9_save);
 240 
 241     __ sd(x18, x18_save);
 242     __ sd(x19, x19_save);
 243     __ sd(x20, x20_save);
 244     __ sd(x21, x21_save);
 245     __ sd(x22, x22_save);
 246     __ sd(x23, x23_save);
 247     __ sd(x24, x24_save);
 248     __ sd(x25, x25_save);
 249     __ sd(x26, x26_save);
 250     __ sd(x27, x27_save);
 251 
 252     // install Java thread in global register now we have saved
 253     // whatever value it held
 254     __ mv(xthread, c_rarg7);
 255 
 256     // And method
 257     __ mv(xmethod, c_rarg3);
 258 
 259     // set up the heapbase register
 260     __ reinit_heapbase();
 261 
 262 #ifdef ASSERT
 263     // make sure we have no pending exceptions
 264     {
 265       Label L;
 266       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 267       __ beqz(t0, L);
 268       __ stop("StubRoutines::call_stub: entered with pending exception");
 269       __ BIND(L);
 270     }
 271 #endif
 272     // pass parameters if any
 273     __ mv(esp, sp);
 274     __ slli(t0, c_rarg6, LogBytesPerWord);
 275     __ sub(t0, sp, t0); // Move SP out of the way
 276     __ andi(sp, t0, -2 * wordSize);
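          // sp is now 16-byte aligned: it was dropped by the parameter size
          // above and then rounded down with andi -2 * wordSize (-16)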
 277 
 278     BLOCK_COMMENT("pass parameters if any");
 279     Label parameters_done;
 280     // parameter count is still in c_rarg6
 281     // and parameter pointer identifying param 1 is in c_rarg5
 282     __ beqz(c_rarg6, parameters_done);
 283 
 284     address loop = __ pc();
 285     __ ld(t0, c_rarg5, 0);
 286     __ addi(c_rarg5, c_rarg5, wordSize);
 287     __ addi(c_rarg6, c_rarg6, -1);
 288     __ push_reg(t0);
 289     __ bgtz(c_rarg6, loop);
 290 
 291     __ BIND(parameters_done);
 292 
  293     // call Java entry -- passing method, and current sp
 294     //      xmethod: Method*
 295     //      x30: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mv(x30, sp);
 298     __ jalr(c_rarg4);
 299 
 300     // save current address for use by exception handling code
 301 
 302     return_address = __ pc();
 303 
 304     // store result depending on type (everything that is not
 305     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 306     // n.b. this assumes Java returns an integral result in x10
 307     // and a floating result in j_farg0
 308     __ ld(j_rarg2, result);
 309     Label is_long, is_float, is_double, exit;
 310     __ ld(j_rarg1, result_type);
 311     __ li(t0, (u1)T_OBJECT);
 312     __ beq(j_rarg1, t0, is_long);
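          // n.b. a T_OBJECT result is an oop stored as a full 64-bit value,
          // so it shares the is_long path below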
 313     __ li(t0, (u1)T_LONG);
 314     __ beq(j_rarg1, t0, is_long);
 315     __ li(t0, (u1)T_FLOAT);
 316     __ beq(j_rarg1, t0, is_float);
 317     __ li(t0, (u1)T_DOUBLE);
 318     __ beq(j_rarg1, t0, is_double);
 319 
 320     // handle T_INT case
 321     __ sw(x10, Address(j_rarg2));
 322 
 323     __ BIND(exit);
 324 
 325     // pop parameters
 326     __ addi(esp, fp, sp_after_call_off * wordSize);
 327 
 328 #ifdef ASSERT
 329     // verify that threads correspond
 330     {
 331       Label L, S;
 332       __ ld(t0, thread);
 333       __ bne(xthread, t0, S);
 334       __ get_thread(t0);
 335       __ beq(xthread, t0, L);
 336       __ BIND(S);
 337       __ stop("StubRoutines::call_stub: threads must correspond");
 338       __ BIND(L);
 339     }
 340 #endif
 341 
 342     // restore callee-save registers
 343     __ ld(x27, x27_save);
 344     __ ld(x26, x26_save);
 345     __ ld(x25, x25_save);
 346     __ ld(x24, x24_save);
 347     __ ld(x23, x23_save);
 348     __ ld(x22, x22_save);
 349     __ ld(x21, x21_save);
 350     __ ld(x20, x20_save);
 351     __ ld(x19, x19_save);
 352     __ ld(x18, x18_save);
 353 
 354     __ ld(x9, x9_save);
 355     __ ld(x4, x4_save);
 356 
 357     __ ld(c_rarg0, call_wrapper);
 358     __ ld(c_rarg1, result);
 359     __ ld(c_rarg2, result_type);
 360     __ ld(c_rarg3, method);
 361     __ ld(c_rarg4, entry_point);
 362     __ ld(c_rarg5, parameters);
 363     __ ld(c_rarg6, parameter_size);
 364     __ ld(c_rarg7, thread);
 365 
 366     // leave frame and return to caller
 367     __ leave();
 368     __ ret();
 369 
 370     // handle return types different from T_INT
 371 
 372     __ BIND(is_long);
 373     __ sd(x10, Address(j_rarg2, 0));
 374     __ j(exit);
 375 
 376     __ BIND(is_float);
 377     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 378     __ j(exit);
 379 
 380     __ BIND(is_double);
 381     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 382     __ j(exit);
 383 
 384     return start;
 385   }
 386 
 387   // Return point for a Java call if there's an exception thrown in
 388   // Java code.  The exception is caught and transformed into a
 389   // pending exception stored in JavaThread that can be tested from
 390   // within the VM.
 391   //
 392   // Note: Usually the parameters are removed by the callee. In case
 393   // of an exception crossing an activation frame boundary, that is
 394   // not the case if the callee is compiled code => need to setup the
 395   // rsp.
 396   //
 397   // x10: exception oop
 398 
 399   address generate_catch_exception() {
 400     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 401     address start = __ pc();
 402 
 403     // same as in generate_call_stub():
 404     const Address thread(fp, thread_off * wordSize);
 405 
 406 #ifdef ASSERT
 407     // verify that threads correspond
 408     {
 409       Label L, S;
 410       __ ld(t0, thread);
 411       __ bne(xthread, t0, S);
 412       __ get_thread(t0);
 413       __ beq(xthread, t0, L);
 414       __ bind(S);
 415       __ stop("StubRoutines::catch_exception: threads must correspond");
 416       __ bind(L);
 417     }
 418 #endif
 419 
 420     // set pending exception
 421     __ verify_oop(x10);
 422 
 423     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
 424     __ mv(t0, (address)__FILE__);
 425     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 426     __ mv(t0, (int)__LINE__);
 427     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 428 
 429     // complete return to VM
 430     assert(StubRoutines::_call_stub_return_address != NULL,
 431            "_call_stub_return_address must have been generated before");
 432     __ j(StubRoutines::_call_stub_return_address);
 433 
 434     return start;
 435   }
 436 
 437   // Continuation point for runtime calls returning with a pending
 438   // exception.  The pending exception check happened in the runtime
 439   // or native call stub.  The pending exception in Thread is
 440   // converted into a Java-level exception.
 441   //
 442   // Contract with Java-level exception handlers:
 443   // x10: exception
 444   // x13: throwing pc
 445   //
 446   // NOTE: At entry of this stub, exception-pc must be in LR !!
 447 
 448   // NOTE: this is always used as a jump target within generated code
  449   // so it just needs to be generated code with no prolog
 450 
 451   address generate_forward_exception() {
 452     StubCodeMark mark(this, "StubRoutines", "forward exception");
 453     address start = __ pc();
 454 
 455     // Upon entry, LR points to the return address returning into
 456     // Java (interpreted or compiled) code; i.e., the return address
 457     // becomes the throwing pc.
 458     //
 459     // Arguments pushed before the runtime call are still on the stack
 460     // but the exception handler will reset the stack pointer ->
 461     // ignore them.  A potential result in registers can be ignored as
 462     // well.
 463 
 464 #ifdef ASSERT
 465     // make sure this code is only executed if there is a pending exception
 466     {
 467       Label L;
 468       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 469       __ bnez(t0, L);
 470       __ stop("StubRoutines::forward exception: no pending exception (1)");
 471       __ bind(L);
 472     }
 473 #endif
 474 
 475     // compute exception handler into x9
 476 
 477     // call the VM to find the handler address associated with the
 478     // caller address. pass thread in x10 and caller pc (ret address)
 479     // in x11. n.b. the caller pc is in lr, unlike x86 where it is on
 480     // the stack.
 481     __ mv(c_rarg1, lr);
 482     // lr will be trashed by the VM call so we move it to x9
 483     // (callee-saved) because we also need to pass it to the handler
 484     // returned by this call.
 485     __ mv(x9, lr);
 486     BLOCK_COMMENT("call exception_handler_for_return_address");
 487     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 488                          SharedRuntime::exception_handler_for_return_address),
 489                     xthread, c_rarg1);
 490     // we should not really care that lr is no longer the callee
 491     // address. we saved the value the handler needs in x9 so we can
 492     // just copy it to x13. however, the C2 handler will push its own
 493     // frame and then calls into the VM and the VM code asserts that
 494     // the PC for the frame above the handler belongs to a compiled
 495     // Java method. So, we restore lr here to satisfy that assert.
 496     __ mv(lr, x9);
 497     // setup x10 & x13 & clear pending exception
 498     __ mv(x13, x9);
 499     __ mv(x9, x10);
 500     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 501     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 502 
 503 #ifdef ASSERT
 504     // make sure exception is set
 505     {
 506       Label L;
 507       __ bnez(x10, L);
 508       __ stop("StubRoutines::forward exception: no pending exception (2)");
 509       __ bind(L);
 510     }
 511 #endif
 512 
 513     // continue at exception handler
 514     // x10: exception
 515     // x13: throwing pc
 516     // x9: exception handler
 517     __ verify_oop(x10);
 518     __ jr(x9);
 519 
 520     return start;
 521   }
 522 
 523   // Non-destructive plausibility checks for oops
 524   //
 525   // Arguments:
 526   //    x10: oop to verify
 527   //    t0: error message
 528   //
 529   // Stack after saving c_rarg3:
 530   //    [tos + 0]: saved c_rarg3
 531   //    [tos + 1]: saved c_rarg2
 532   //    [tos + 2]: saved lr
 533   //    [tos + 3]: saved t1
 534   //    [tos + 4]: saved x10
 535   //    [tos + 5]: saved t0
 536   address generate_verify_oop() {
 537 
 538     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 539     address start = __ pc();
 540 
 541     Label exit, error;
 542 
 543     __ push_reg(0x3000, sp);   // save c_rarg2 and c_rarg3
 544 
 545     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 546     __ ld(c_rarg3, Address(c_rarg2));
 547     __ add(c_rarg3, c_rarg3, 1);
 548     __ sd(c_rarg3, Address(c_rarg2));
 549 
 550     // object is in x10
 551     // make sure object is 'reasonable'
 552     __ beqz(x10, exit); // if obj is NULL it is OK
 553 
 554 #if INCLUDE_ZGC
 555     if (UseZGC) {
 556       // Check if mask is good.
 557       // verifies that ZAddressBadMask & x10 == 0
 558       __ ld(c_rarg3, Address(xthread, ZThreadLocalData::address_bad_mask_offset()));
 559       __ andr(c_rarg2, x10, c_rarg3);
 560       __ bnez(c_rarg2, error);
 561     }
 562 #endif
 563 
 564     // Check if the oop is in the right area of memory
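          // i.e. require (oop & verify_oop_mask()) == verify_oop_bits()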
 565     __ mv(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 566     __ andr(c_rarg2, x10, c_rarg3);
 567     __ mv(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 568 
 569     // Compare c_rarg2 and c_rarg3.
 570     __ bne(c_rarg2, c_rarg3, error);
 571 
  572     // make sure klass is 'reasonable', i.e. not NULL.
 573     __ load_klass(x10, x10);  // get klass
 574     __ beqz(x10, error);      // if klass is NULL it is broken
 575 
 576     // return if everything seems ok
 577     __ bind(exit);
 578 
 579     __ pop_reg(0x3000, sp);   // pop c_rarg2 and c_rarg3
 580     __ ret();
 581 
 582     // handle errors
 583     __ bind(error);
 584     __ pop_reg(0x3000, sp);   // pop c_rarg2 and c_rarg3
 585 
 586     __ pusha();
 587     // debug(char* msg, int64_t pc, int64_t regs[])
 588     __ mv(c_rarg0, t0);             // pass address of error message
 589     __ mv(c_rarg1, lr);             // pass return address
 590     __ mv(c_rarg2, sp);             // pass address of regs on stack
 591 #ifndef PRODUCT
 592     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 593 #endif
 594     BLOCK_COMMENT("call MacroAssembler::debug");
 595     int32_t offset = 0;
 596     __ movptr_with_offset(t0, CAST_FROM_FN_PTR(address, MacroAssembler::debug64), offset);
 597     __ jalr(x1, t0, offset);
 598     __ ebreak();
 599 
 600     return start;
 601   }
 602 
 603   // The inner part of zero_words().
 604   //
 605   // Inputs:
 606   // x28: the HeapWord-aligned base address of an array to zero.
 607   // x29: the count in HeapWords, x29 > 0.
 608   //
 609   // Returns x28 and x29, adjusted for the caller to clear.
 610   // x28: the base address of the tail of words left to clear.
 611   // x29: the number of words in the tail.
 612   //      x29 < MacroAssembler::zero_words_block_size.
 613 
 614   address generate_zero_blocks() {
 615     Label done;
 616 
 617     const Register base = x28, cnt = x29;
 618 
 619     __ align(CodeEntryAlignment);
 620     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 621     address start = __ pc();
 622 
 623     {
 624       // Clear the remaining blocks.
 625       Label loop;
 626       __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
 627       __ bltz(cnt, done);
 628       __ bind(loop);
 629       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 630         __ sd(zr, Address(base, 0));
 631         __ add(base, base, 8);
 632       }
 633       __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
 634       __ bgez(cnt, loop);
 635       __ bind(done);
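            // restore the residual word count (0 <= cnt < zero_words_block_size)
            // for the caller by undoing the final subtraction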
 636       __ add(cnt, cnt, MacroAssembler::zero_words_block_size);
 637     }
 638 
 639     __ ret();
 640 
 641     return start;
 642   }
 643 
 644   typedef enum {
 645     copy_forwards = 1,
 646     copy_backwards = -1
 647   } copy_direction;
 648 
 649   // Bulk copy of blocks of 8 words.
 650   //
 651   // count is a count of words.
 652   //
 653   // Precondition: count >= 8
 654   //
 655   // Postconditions:
 656   //
 657   // The least significant bit of count contains the remaining count
 658   // of words to copy.  The rest of count is trash.
 659   //
 660   // s and d are adjusted to point to the remaining words to copy
 661   //
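        // Roughly, for the forward case:
        //
        //   load 8 words; count -= 16;
        //   while (count >= 0) {
        //     store the 8 words in flight; load the next 8; count -= 8;
        //   }
        //   store the last 8 words in flight, then copy 4-word and 2-word
        //   tails as indicated by bits 2 and 1 of count.
        //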
 662   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 663                            copy_direction direction) {
 664     int unit = wordSize * direction;
 665     int bias = wordSize;
 666 
 667     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 668       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 669 
 670     const Register stride = x30;
 671 
 672     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 673       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 674     assert_different_registers(s, d, count, t0);
 675 
 676     Label again, drain;
 677     const char* stub_name = NULL;
 678     if (direction == copy_forwards) {
 679       stub_name = "forward_copy_longs";
 680     } else {
 681       stub_name = "backward_copy_longs";
 682     }
 683     StubCodeMark mark(this, "StubRoutines", stub_name);
 684     __ align(CodeEntryAlignment);
 685     __ bind(start);
 686 
 687     if (direction == copy_forwards) {
 688       __ sub(s, s, bias);
 689       __ sub(d, d, bias);
 690     }
 691 
 692 #ifdef ASSERT
 693     // Make sure we are never given < 8 words
 694     {
 695       Label L;
 696 
 697       __ li(t0, 8);
 698       __ bge(count, t0, L);
 699       __ stop("genrate_copy_longs called with < 8 words");
 700       __ bind(L);
 701     }
 702 #endif
 703 
 704     __ ld(tmp_reg0, Address(s, 1 * unit));
 705     __ ld(tmp_reg1, Address(s, 2 * unit));
 706     __ ld(tmp_reg2, Address(s, 3 * unit));
 707     __ ld(tmp_reg3, Address(s, 4 * unit));
 708     __ ld(tmp_reg4, Address(s, 5 * unit));
 709     __ ld(tmp_reg5, Address(s, 6 * unit));
 710     __ ld(tmp_reg6, Address(s, 7 * unit));
 711     __ ld(tmp_reg7, Address(s, 8 * unit));
 712     __ addi(s, s, 8 * unit);
 713 
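          // software pipelining: 8 words are already in flight from the loads
          // above, so only enter the main loop while at least another 8 words
          // remain (i.e. the original count was >= 16)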
 714     __ sub(count, count, 16);
 715     __ bltz(count, drain);
 716 
 717     __ bind(again);
 718 
 719     __ sd(tmp_reg0, Address(d, 1 * unit));
 720     __ sd(tmp_reg1, Address(d, 2 * unit));
 721     __ sd(tmp_reg2, Address(d, 3 * unit));
 722     __ sd(tmp_reg3, Address(d, 4 * unit));
 723     __ sd(tmp_reg4, Address(d, 5 * unit));
 724     __ sd(tmp_reg5, Address(d, 6 * unit));
 725     __ sd(tmp_reg6, Address(d, 7 * unit));
 726     __ sd(tmp_reg7, Address(d, 8 * unit));
 727 
 728     __ ld(tmp_reg0, Address(s, 1 * unit));
 729     __ ld(tmp_reg1, Address(s, 2 * unit));
 730     __ ld(tmp_reg2, Address(s, 3 * unit));
 731     __ ld(tmp_reg3, Address(s, 4 * unit));
 732     __ ld(tmp_reg4, Address(s, 5 * unit));
 733     __ ld(tmp_reg5, Address(s, 6 * unit));
 734     __ ld(tmp_reg6, Address(s, 7 * unit));
 735     __ ld(tmp_reg7, Address(s, 8 * unit));
 736 
 737     __ addi(s, s, 8 * unit);
 738     __ addi(d, d, 8 * unit);
 739 
 740     __ sub(count, count, 8);
 741     __ bgez(count, again);
 742 
 743     // Drain
 744     __ bind(drain);
 745 
 746     __ sd(tmp_reg0, Address(d, 1 * unit));
 747     __ sd(tmp_reg1, Address(d, 2 * unit));
 748     __ sd(tmp_reg2, Address(d, 3 * unit));
 749     __ sd(tmp_reg3, Address(d, 4 * unit));
 750     __ sd(tmp_reg4, Address(d, 5 * unit));
 751     __ sd(tmp_reg5, Address(d, 6 * unit));
 752     __ sd(tmp_reg6, Address(d, 7 * unit));
 753     __ sd(tmp_reg7, Address(d, 8 * unit));
 754     __ addi(d, d, 8 * unit);
 755 
 756     {
 757       Label L1, L2;
 758       __ andi(t0, count, 4);
 759       __ beqz(t0, L1);
 760 
 761       __ ld(tmp_reg0, Address(s, 1 * unit));
 762       __ ld(tmp_reg1, Address(s, 2 * unit));
 763       __ ld(tmp_reg2, Address(s, 3 * unit));
 764       __ ld(tmp_reg3, Address(s, 4 * unit));
 765       __ addi(s, s, 4 * unit);
 766 
 767       __ sd(tmp_reg0, Address(d, 1 * unit));
 768       __ sd(tmp_reg1, Address(d, 2 * unit));
 769       __ sd(tmp_reg2, Address(d, 3 * unit));
 770       __ sd(tmp_reg3, Address(d, 4 * unit));
 771       __ addi(d, d, 4 * unit);
 772 
 773       __ bind(L1);
 774 
 775       if (direction == copy_forwards) {
 776         __ addi(s, s, bias);
 777         __ addi(d, d, bias);
 778       }
 779 
 780       __ andi(t0, count, 2);
 781       __ beqz(t0, L2);
 782       if (direction == copy_backwards) {
 783         __ addi(s, s, 2 * unit);
 784         __ ld(tmp_reg0, Address(s));
 785         __ ld(tmp_reg1, Address(s, wordSize));
 786         __ addi(d, d, 2 * unit);
 787         __ sd(tmp_reg0, Address(d));
 788         __ sd(tmp_reg1, Address(d, wordSize));
 789       } else {
 790         __ ld(tmp_reg0, Address(s));
 791         __ ld(tmp_reg1, Address(s, wordSize));
 792         __ addi(s, s, 2 * unit);
 793         __ sd(tmp_reg0, Address(d));
 794         __ sd(tmp_reg1, Address(d, wordSize));
 795         __ addi(d, d, 2 * unit);
 796       }
 797       __ bind(L2);
 798     }
 799 
 800     __ ret();
 801   }
 802 
 803   Label copy_f, copy_b;
 804 
 805   // All-singing all-dancing memory copy.
 806   //
 807   // Copy count units of memory from s to d.  The size of a unit is
 808   // step, which can be positive or negative depending on the direction
 809   // of copy.  If is_aligned is false, we align the source address.
 810   //
 811   /*
 812    * if (is_aligned) {
 813    *   goto copy_8_bytes;
 814    * }
 815    * bool is_backwards = step < 0;
 816    * int granularity = uabs(step);
 817    * count = count  *  granularity;   * count bytes
 818    *
 819    * if (is_backwards) {
 820    *   s += count;
 821    *   d += count;
 822    * }
 823    *
  824    * the count limit may be greater than 16, for better performance
 825    * if (count < 16) {
 826    *   goto copy_small;
 827    * }
 828    *
 829    * if ((dst % 8) == (src % 8)) {
 830    *   aligned;
 831    *   goto copy8;
 832    * }
 833    *
 834    * copy_small:
 835    *   load element one by one;
 836    * done;
 837    */
 838 
 839   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
 840 
 841   void copy_memory_v(Register s, Register d, Register count, Register tmp, int step) {
 842     bool is_backward = step < 0;
 843     int granularity = uabs(step);
 844 
 845     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 846     assert_different_registers(s, d, cnt, vl, tmp, tmp1, tmp2);
 847     Assembler::SEW sew = Assembler::elemBytes_to_sew(granularity);
 848     assert(sew >= Assembler::e8 && sew <= Assembler::e64, "illegal SEW");
 849     Label loop_forward, loop_backward, done;
 850 
 851     __ mv(dst, d);
 852     __ mv(src, s);
 853     __ mv(cnt, count);
 854 
 855     __ bind(loop_forward);
 856     __ vsetvli(vl, cnt, sew, Assembler::m8);
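          // vl = number of elements processed by this strip (at most the
          // vector capacity for this sew at LMUL=8). For an overlapping
          // backward copy it is only safe to copy forwards once the whole
          // remainder fits in a single strip; until then take loop_backward,
          // which works from the tail.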
 857     if (is_backward) {
 858       __ bne(vl, cnt, loop_backward);
 859     }
 860 
 861     __ vlex_v(v0, src, sew);
 862     __ sub(cnt, cnt, vl);
 863     __ slli(vl, vl, (int)sew);
 864     __ add(src, src, vl);
 865 
 866     __ vsex_v(v0, dst, sew);
 867     __ add(dst, dst, vl);
 868     __ bnez(cnt, loop_forward);
 869 
 870     if (is_backward) {
 871       __ j(done);
 872 
 873       __ bind(loop_backward);
 874       __ sub(tmp, cnt, vl);
 875       __ slli(tmp, tmp, sew);
 876       __ add(tmp1, s, tmp);
 877       __ vlex_v(v0, tmp1, sew);
 878       __ add(tmp2, d, tmp);
 879       __ vsex_v(v0, tmp2, sew);
 880       __ sub(cnt, cnt, vl);
 881       __ bnez(cnt, loop_forward);
 882       __ bind(done);
 883     }
 884   }
 885 
 886   void copy_memory(bool is_aligned, Register s, Register d,
 887                    Register count, Register tmp, int step) {
 888     if (UseVExt) {
 889       return copy_memory_v(s, d, count, tmp, step);
 890     }
 891 
 892     bool is_backwards = step < 0;
 893     int granularity = uabs(step);
 894 
 895     const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17;
 896 
 897     Label same_aligned;
 898     Label copy8, copy_small, done;
 899 
 900     copy_insn ld_arr = NULL, st_arr = NULL;
 901     switch (granularity) {
 902       case 1 :
 903         ld_arr = (copy_insn)&MacroAssembler::lbu;
 904         st_arr = (copy_insn)&MacroAssembler::sb;
 905         break;
 906       case 2 :
 907         ld_arr = (copy_insn)&MacroAssembler::lhu;
 908         st_arr = (copy_insn)&MacroAssembler::sh;
 909         break;
 910       case 4 :
 911         ld_arr = (copy_insn)&MacroAssembler::lwu;
 912         st_arr = (copy_insn)&MacroAssembler::sw;
 913         break;
 914       case 8 :
 915         ld_arr = (copy_insn)&MacroAssembler::ld;
 916         st_arr = (copy_insn)&MacroAssembler::sd;
 917         break;
 918       default :
 919         ShouldNotReachHere();
 920     }
 921 
 922     __ beqz(count, done);
 923     __ slli(cnt, count, log2i_exact(granularity));
 924     if (is_backwards) {
 925       __ add(src, s, cnt);
 926       __ add(dst, d, cnt);
 927     } else {
 928       __ mv(src, s);
 929       __ mv(dst, d);
 930     }
 931 
 932     if (is_aligned) {
 933       __ addi(tmp, cnt, -8);
 934       __ bgez(tmp, copy8);
 935       __ j(copy_small);
 936     }
 937 
 938     __ mv(tmp, 16);
 939     __ blt(cnt, tmp, copy_small);
 940 
 941     __ xorr(tmp, src, dst);
 942     __ andi(tmp, tmp, 0b111);
 943     __ bnez(tmp, copy_small);
 944 
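          // src and dst are mutually aligned: copy single elements until src
          // (and therefore dst) reaches an 8-byte boundary, then switch to
          // 8-byte word copies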
 945     __ bind(same_aligned);
 946     __ andi(tmp, src, 0b111);
 947     __ beqz(tmp, copy8);
 948     if (is_backwards) {
 949       __ addi(src, src, step);
 950       __ addi(dst, dst, step);
 951     }
 952     (_masm->*ld_arr)(tmp3, Address(src), t0);
 953     (_masm->*st_arr)(tmp3, Address(dst), t0);
 954     if (!is_backwards) {
 955       __ addi(src, src, step);
 956       __ addi(dst, dst, step);
 957     }
 958     __ addi(cnt, cnt, -granularity);
 959     __ beqz(cnt, done);
 960     __ j(same_aligned);
 961 
 962     __ bind(copy8);
 963     if (is_backwards) {
 964       __ addi(src, src, -wordSize);
 965       __ addi(dst, dst, -wordSize);
 966     }
 967     __ ld(tmp3, Address(src));
 968     __ sd(tmp3, Address(dst));
 969     if (!is_backwards) {
 970       __ addi(src, src, wordSize);
 971       __ addi(dst, dst, wordSize);
 972     }
 973     __ addi(cnt, cnt, -wordSize);
 974     __ addi(tmp4, cnt, -8);
 975     __ bgez(tmp4, copy8); // cnt >= 8, do next loop
 976 
 977     __ beqz(cnt, done);
 978 
 979     __ bind(copy_small);
 980     if (is_backwards) {
 981       __ addi(src, src, step);
 982       __ addi(dst, dst, step);
 983     }
 984     (_masm->*ld_arr)(tmp3, Address(src), t0);
 985     (_masm->*st_arr)(tmp3, Address(dst), t0);
 986     if (!is_backwards) {
 987       __ addi(src, src, step);
 988       __ addi(dst, dst, step);
 989     }
 990     __ addi(cnt, cnt, -granularity);
 991     __ bgtz(cnt, copy_small);
 992 
 993     __ bind(done);
 994   }
 995 
 996   // Scan over array at a for count oops, verifying each one.
 997   // Preserves a and count, clobbers t0 and t1.
 998   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
 999     Label loop, end;
1000     __ mv(t1, zr);
1001     __ slli(t0, count, log2i_exact(size));
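          // t0 = total byte length of the array data, t1 = current byte offset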
1002     __ bind(loop);
1003     __ bgeu(t1, t0, end);
1004 
1005     __ add(temp, a, t1);
1006     if (size == (size_t)wordSize) {
1007       __ ld(temp, Address(temp, 0));
1008       __ verify_oop(temp);
1009     } else {
1010       __ lwu(temp, Address(temp, 0));
1011       __ decode_heap_oop(temp); // calls verify_oop
1012     }
1013     __ add(t1, t1, size);
1014     __ j(loop);
1015     __ bind(end);
1016   }
1017 
1018   // Arguments:
1019   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1020   //             ignored
1021   //   is_oop  - true => oop array, so generate store check code
1022   //   name    - stub name string
1023   //
1024   // Inputs:
1025   //   c_rarg0   - source array address
1026   //   c_rarg1   - destination array address
1027   //   c_rarg2   - element count, treated as ssize_t, can be zero
1028   //
1029   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1030   // the hardware handle it.  The two dwords within qwords that span
 1031   // cache line boundaries will still be loaded and stored atomically.
1032   //
1033   // Side Effects:
1034   //   disjoint_int_copy_entry is set to the no-overlap entry point
1035   //   used by generate_conjoint_int_oop_copy().
1036   //
1037   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry,
1038                                  const char* name, bool dest_uninitialized = false) {
1039     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1040     RegSet saved_reg = RegSet::of(s, d, count);
1041     __ align(CodeEntryAlignment);
1042     StubCodeMark mark(this, "StubRoutines", name);
1043     address start = __ pc();
1044     __ enter();
1045 
1046     if (entry != NULL) {
1047       *entry = __ pc();
1048       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1049       BLOCK_COMMENT("Entry:");
1050     }
1051 
1052     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1053     if (dest_uninitialized) {
1054       decorators |= IS_DEST_UNINITIALIZED;
1055     }
1056     if (aligned) {
1057       decorators |= ARRAYCOPY_ALIGNED;
1058     }
1059 
1060     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1061     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1062 
1063     if (is_oop) {
1064       // save regs before copy_memory
1065       __ push_reg(RegSet::of(d, count), sp);
1066     }
1067 
1068     {
1069       // UnsafeCopyMemory page error: continue after ucm
1070       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1071       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1072       copy_memory(aligned, s, d, count, t0, size);
1073     }
1074 
1075     if (is_oop) {
1076       __ pop_reg(RegSet::of(d, count), sp);
1077       if (VerifyOops) {
1078         verify_oop_array(size, d, count, t2);
1079       }
1080     }
1081 
1082     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1083 
1084     __ leave();
1085     __ mv(x10, zr); // return 0
1086     __ ret();
1087     return start;
1088   }
1089 
1090   // Arguments:
1091   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1092   //             ignored
1093   //   is_oop  - true => oop array, so generate store check code
1094   //   name    - stub name string
1095   //
1096   // Inputs:
1097   //   c_rarg0   - source array address
1098   //   c_rarg1   - destination array address
1099   //   c_rarg2   - element count, treated as ssize_t, can be zero
1100   //
1101   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1102   // the hardware handle it.  The two dwords within qwords that span
 1103   // cache line boundaries will still be loaded and stored atomically.
1104   //
1105   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1106                                  address* entry, const char* name,
1107                                  bool dest_uninitialized = false) {
1108     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1109     RegSet saved_regs = RegSet::of(s, d, count);
1110     StubCodeMark mark(this, "StubRoutines", name);
1111     address start = __ pc();
1112     __ enter();
1113 
1114     if (entry != NULL) {
1115       *entry = __ pc();
1116       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1117       BLOCK_COMMENT("Entry:");
1118     }
1119 
1120     // use fwd copy when (d-s) above_equal (count*size)
1121     __ sub(t0, d, s);
1122     __ slli(t1, count, log2i_exact(size));
1123     __ bgeu(t0, t1, nooverlap_target);
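          // the subtraction and comparison are unsigned, so d < s also takes
          // the forward (no-overlap) path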
1124 
1125     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1126     if (dest_uninitialized) {
1127       decorators |= IS_DEST_UNINITIALIZED;
1128     }
1129     if (aligned) {
1130       decorators |= ARRAYCOPY_ALIGNED;
1131     }
1132 
1133     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1134     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1135 
1136     if (is_oop) {
1137       // save regs before copy_memory
1138       __ push_reg(RegSet::of(d, count), sp);
1139     }
1140 
1141     {
1142       // UnsafeCopyMemory page error: continue after ucm
1143       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1144       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1145       copy_memory(aligned, s, d, count, t0, -size);
1146     }
1147 
1148     if (is_oop) {
1149       __ pop_reg(RegSet::of(d, count), sp);
1150       if (VerifyOops) {
1151         verify_oop_array(size, d, count, t2);
1152       }
1153     }
1154     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1155     __ leave();
1156     __ mv(x10, zr); // return 0
1157     __ ret();
1158     return start;
1159   }
1160 
1161   // Arguments:
1162   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1163   //             ignored
1164   //   name    - stub name string
1165   //
1166   // Inputs:
1167   //   c_rarg0   - source array address
1168   //   c_rarg1   - destination array address
1169   //   c_rarg2   - element count, treated as ssize_t, can be zero
1170   //
1171   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1172   // we let the hardware handle it.  The one to eight bytes within words,
1173   // dwords or qwords that span cache line boundaries will still be loaded
1174   // and stored atomically.
1175   //
1176   // Side Effects:
1184   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1185   //   used by generate_conjoint_byte_copy().
1186   //
1187   address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) {
1188     const bool not_oop = false;
1189     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1190   }
1191 
1192   // Arguments:
1193   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1194   //             ignored
1195   //   name    - stub name string
1196   //
1197   // Inputs:
1198   //   c_rarg0   - source array address
1199   //   c_rarg1   - destination array address
1200   //   c_rarg2   - element count, treated as ssize_t, can be zero
1201   //
1202   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1203   // we let the hardware handle it.  The one to eight bytes within words,
1204   // dwords or qwords that span cache line boundaries will still be loaded
1205   // and stored atomically.
1206   //
1207   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1208                                       address* entry, const char* name) {
1209     const bool not_oop = false;
1210     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1211   }
1212 
1213   // Arguments:
1214   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1215   //             ignored
1216   //   name    - stub name string
1217   //
1218   // Inputs:
1219   //   c_rarg0   - source array address
1220   //   c_rarg1   - destination array address
1221   //   c_rarg2   - element count, treated as ssize_t, can be zero
1222   //
1223   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1224   // let the hardware handle it.  The two or four words within dwords
1225   // or qwords that span cache line boundaries will still be loaded
1226   // and stored atomically.
1227   //
1228   // Side Effects:
1229   //   disjoint_short_copy_entry is set to the no-overlap entry point
1230   //   used by generate_conjoint_short_copy().
1231   //
1232   address generate_disjoint_short_copy(bool aligned,
1233                                        address* entry, const char* name) {
1234     const bool not_oop = false;
1235     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1236   }
1237 
1238   // Arguments:
1239   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1240   //             ignored
1241   //   name    - stub name string
1242   //
1243   // Inputs:
1244   //   c_rarg0   - source array address
1245   //   c_rarg1   - destination array address
1246   //   c_rarg2   - element count, treated as ssize_t, can be zero
1247   //
1248   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1249   // let the hardware handle it.  The two or four words within dwords
1250   // or qwords that span cache line boundaries will still be loaded
1251   // and stored atomically.
1252   //
1253   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1254                                        address* entry, const char* name) {
1255     const bool not_oop = false;
1256     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1257   }
1258 
1259   // Arguments:
1260   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1261   //             ignored
1262   //   name    - stub name string
1263   //
1264   // Inputs:
1265   //   c_rarg0   - source array address
1266   //   c_rarg1   - destination array address
1267   //   c_rarg2   - element count, treated as ssize_t, can be zero
1268   //
1269   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1270   // the hardware handle it.  The two dwords within qwords that span
 1271   // cache line boundaries will still be loaded and stored atomically.
1272   //
1273   // Side Effects:
1274   //   disjoint_int_copy_entry is set to the no-overlap entry point
1275   //   used by generate_conjoint_int_oop_copy().
1276   //
1277   address generate_disjoint_int_copy(bool aligned, address* entry,
1278                                      const char* name, bool dest_uninitialized = false) {
1279     const bool not_oop = false;
1280     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1281   }
1282 
1283   // Arguments:
1284   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1285   //             ignored
1286   //   name    - stub name string
1287   //
1288   // Inputs:
1289   //   c_rarg0   - source array address
1290   //   c_rarg1   - destination array address
1291   //   c_rarg2   - element count, treated as ssize_t, can be zero
1292   //
1293   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1294   // the hardware handle it.  The two dwords within qwords that span
 1295   // cache line boundaries will still be loaded and stored atomically.
1296   //
1297   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1298                                      address* entry, const char* name,
1299                                      bool dest_uninitialized = false) {
1300     const bool not_oop = false;
1301     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1302   }
1303 
1304 
1305   // Arguments:
1306   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1307   //             ignored
1308   //   name    - stub name string
1309   //
1310   // Inputs:
1311   //   c_rarg0   - source array address
1312   //   c_rarg1   - destination array address
1313   //   c_rarg2   - element count, treated as size_t, can be zero
1314   //
1315   // Side Effects:
1316   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1317   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1318   //
1319   address generate_disjoint_long_copy(bool aligned, address* entry,
1320                                       const char* name, bool dest_uninitialized = false) {
1321     const bool not_oop = false;
1322     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1323   }
1324 
1325   // Arguments:
1326   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1327   //             ignored
1328   //   name    - stub name string
1329   //
1330   // Inputs:
1331   //   c_rarg0   - source array address
1332   //   c_rarg1   - destination array address
1333   //   c_rarg2   - element count, treated as size_t, can be zero
1334   //
1335   address generate_conjoint_long_copy(bool aligned,
1336                                       address nooverlap_target, address* entry,
1337                                       const char* name, bool dest_uninitialized = false) {
1338     const bool not_oop = false;
1339     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1340   }
1341 
1342   // Arguments:
1343   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1344   //             ignored
1345   //   name    - stub name string
1346   //
1347   // Inputs:
1348   //   c_rarg0   - source array address
1349   //   c_rarg1   - destination array address
1350   //   c_rarg2   - element count, treated as size_t, can be zero
1351   //
1352   // Side Effects:
1353   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1354   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1355   //
1356   address generate_disjoint_oop_copy(bool aligned, address* entry,
1357                                      const char* name, bool dest_uninitialized) {
1358     const bool is_oop = true;
1359     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1360     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1361   }
1362 
1363   // Arguments:
1364   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1365   //             ignored
1366   //   name    - stub name string
1367   //
1368   // Inputs:
1369   //   c_rarg0   - source array address
1370   //   c_rarg1   - destination array address
1371   //   c_rarg2   - element count, treated as size_t, can be zero
1372   //
1373   address generate_conjoint_oop_copy(bool aligned,
1374                                      address nooverlap_target, address* entry,
1375                                      const char* name, bool dest_uninitialized) {
1376     const bool is_oop = true;
1377     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1378     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1379                                   name, dest_uninitialized);
1380   }
1381 
1382   // Helper for generating a dynamic type check.
1383   // Smashes t0, t1.
1384   void generate_type_check(Register sub_klass,
1385                            Register super_check_offset,
1386                            Register super_klass,
1387                            Label& L_success) {
1388     assert_different_registers(sub_klass, super_check_offset, super_klass);
1389 
1390     BLOCK_COMMENT("type_check:");
1391 
1392     Label L_miss;
1393 
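          // the fast path settles the common cases, branching to L_success or
          // L_miss; when the result is still unknown it falls through to the
          // slow path, which scans the secondary supers array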
1394     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, super_check_offset);
1395     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1396 
1397     // Fall through on failure!
1398     __ BIND(L_miss);
1399   }
1400 
1401   //
1402   //  Generate checkcasting array copy stub
1403   //
1404   //  Input:
1405   //    c_rarg0   - source array address
1406   //    c_rarg1   - destination array address
1407   //    c_rarg2   - element count, treated as ssize_t, can be zero
1408   //    c_rarg3   - size_t ckoff (super_check_offset)
1409   //    c_rarg4   - oop ckval (super_klass)
1410   //
1411   //  Output:
1412   //    x10 ==  0  -  success
1413   //    x10 == -1^K - failure, where K is partial transfer count
1414   //
1415   address generate_checkcast_copy(const char* name, address* entry,
1416                                   bool dest_uninitialized = false) {
1417     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1418 
1419     // Input registers (after setup_arg_regs)
1420     const Register from        = c_rarg0;   // source array address
1421     const Register to          = c_rarg1;   // destination array address
 1422     const Register count       = c_rarg2;   // elements count
1423     const Register ckoff       = c_rarg3;   // super_check_offset
1424     const Register ckval       = c_rarg4;   // super_klass
1425 
1426     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1427     RegSet wb_post_saved_regs  = RegSet::of(count);
1428 
1429     // Registers used as temps (x7, x9, x18 are save-on-entry)
 1430     const Register count_save  = x19;       // orig elements count
1431     const Register start_to    = x18;       // destination array start address
1432     const Register copied_oop  = x7;        // actual oop copied
1433     const Register r9_klass    = x9;        // oop._klass
1434 
1435     //---------------------------------------------------------------
1436     // Assembler stub will be used for this call to arraycopy
1437     // if the two arrays are subtypes of Object[] but the
1438     // destination array type is not equal to or a supertype
1439     // of the source type.  Each element must be separately
1440     // checked.
1441 
1442     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1443                                copied_oop, r9_klass, count_save);
1444 
1445     __ align(CodeEntryAlignment);
1446     StubCodeMark mark(this, "StubRoutines", name);
1447     address start = __ pc();
1448 
1449     __ enter(); // required for proper stackwalking of RuntimeStub frame
1450 
1451     // Caller of this entry point must set up the argument registers.
1452     if (entry != NULL) {
1453       *entry = __ pc();
1454       BLOCK_COMMENT("Entry:");
1455     }
1456 
1457     // Empty array:  Nothing to do
1458     __ beqz(count, L_done);
1459 
1460     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1461 
1462 #ifdef ASSERT
1463     BLOCK_COMMENT("assert consistent ckoff/ckval");
1464     // The ckoff and ckval must be mutually consistent,
1465     // even though caller generates both.
1466     { Label L;
1467       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1468       __ lwu(start_to, Address(ckval, sco_offset));
1469       __ beq(ckoff, start_to, L);
1470       __ stop("super_check_offset inconsistent");
1471       __ bind(L);
1472     }
1473 #endif //ASSERT
1474 
1475     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1476     bool is_oop = true;
1477     if (dest_uninitialized) {
1478       decorators |= IS_DEST_UNINITIALIZED;
1479     }
1480 
1481     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1482     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1483 
1484     // save the original count
1485     __ mv(count_save, count);
1486 
1487     // Copy from low to high addresses
1488     __ mv(start_to, to);              // Save destination array start address
1489     __ j(L_load_element);
1490 
1491     // ======== begin loop ========
1492     // (Loop is rotated; its entry is L_load_element.)
1493     // Loop control:
1494     //   for count to 0 do
1495     //     copied_oop = load_heap_oop(from++)
1496     //     ... generate_type_check ...
1497     //     store_heap_oop(to++, copied_oop)
1498     //   end
1499 
1500     __ align(OptoLoopAlignment);
1501 
1502     __ BIND(L_store_element);
1503     __ store_heap_oop(Address(to, 0), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1504     __ add(to, to, UseCompressedOops ? 4 : 8);
1505     __ sub(count, count, 1);
1506     __ beqz(count, L_do_card_marks);
1507 
1508     // ======== loop entry is here ========
1509     __ BIND(L_load_element);
1510     __ load_heap_oop(copied_oop, Address(from, 0), noreg, noreg, AS_RAW); // load the oop
1511     __ add(from, from, UseCompressedOops ? 4 : 8);
1512     __ beqz(copied_oop, L_store_element);
1513 
 1514     __ load_klass(r9_klass, copied_oop); // query the object klass
1515     generate_type_check(r9_klass, ckoff, ckval, L_store_element);
1516     // ======== end loop ========
1517 
1518     // It was a real error; we must depend on the caller to finish the job.
1519     // Register count = remaining oops, count_save = total oops.
1520     // Emit GC store barriers for the oops we have copied and report
1521     // their number to the caller.
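         // For example, if K == 3 oops were copied before the failing element,
         // x10 is set to -1 ^ 3 == ~3 (i.e. -4) and the caller can recover the
         // partial transfer count as K = ~x10.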
1522 
1523     __ sub(count, count_save, count);     // K = partially copied oop count
1524     __ xori(count, count, -1);                   // report (-1^K) to caller
1525     __ beqz(count, L_done_pop);
1526 
1527     __ BIND(L_do_card_marks);
1528     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);
1529 
1530     __ bind(L_done_pop);
1531     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1532     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1533 
1534     __ bind(L_done);
1535     __ mv(x10, count);
1536     __ leave();
1537     __ ret();
1538 
1539     return start;
1540   }
1541 
1542   // Perform range checks on the proposed arraycopy.
1543   // Kills temp, but nothing else.
1544   // Also, clean the sign bits of src_pos and dst_pos.
1545   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1546                               Register src_pos, // source position (c_rarg1)
1547                               Register dst,     // destination array oop (c_rarg2)
1548                               Register dst_pos, // destination position (c_rarg3)
1549                               Register length,
1550                               Register temp,
1551                               Label& L_failed) {
1552     BLOCK_COMMENT("arraycopy_range_checks:");
1553 
1554     assert_different_registers(t0, temp);
1555 
1556     // if [src_pos + length > arrayOop(src)->length()] then FAIL
1557     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1558     __ addw(temp, length, src_pos);
1559     __ bgtu(temp, t0, L_failed);
1560 
1561     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1562     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1563     __ addw(temp, length, dst_pos);
1564     __ bgtu(temp, t0, L_failed);
1565 
1566     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1567     __ clear_upper_bits(src_pos, 32);
1568     __ clear_upper_bits(dst_pos, 32);
1569 
1570     BLOCK_COMMENT("arraycopy_range_checks done");
1571   }
1572 
1573   //
1574   //  Generate 'unsafe' array copy stub
1575   //  Though just as safe as the other stubs, it takes an unscaled
1576   //  size_t argument instead of an element count.
1577   //
1578   //  Input:
1579   //    c_rarg0   - source array address
1580   //    c_rarg1   - destination array address
1581   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1582   //
1583   // Examines the alignment of the operands and dispatches
1584   // to a long, int, short, or byte copy loop.
1585   //
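       //  As an illustration (values are hypothetical): if the source address,
       //  destination address and byte count are all multiples of 8, then
       //  (s | d | count) has none of its low three bits set and the long copy
       //  loop is used; an odd byte count in any operand falls through to the
       //  byte copy loop.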
1586   address generate_unsafe_copy(const char* name,
1587                                address byte_copy_entry,
1588                                address short_copy_entry,
1589                                address int_copy_entry,
1590                                address long_copy_entry) {
1591     assert_cond(byte_copy_entry != NULL && short_copy_entry != NULL &&
1592                 int_copy_entry != NULL && long_copy_entry != NULL);
1593     Label L_long_aligned, L_int_aligned, L_short_aligned;
1594     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1595 
1596     __ align(CodeEntryAlignment);
1597     StubCodeMark mark(this, "StubRoutines", name);
1598     address start = __ pc();
1599     __ enter(); // required for proper stackwalking of RuntimeStub frame
1600 
1601     // bump this on entry, not on exit:
1602     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1603 
1604     __ orr(t0, s, d);
1605     __ orr(t0, t0, count);
1606 
1607     __ andi(t0, t0, BytesPerLong - 1);
1608     __ beqz(t0, L_long_aligned);
1609     __ andi(t0, t0, BytesPerInt - 1);
1610     __ beqz(t0, L_int_aligned);
1611     __ andi(t0, t0, 1);
1612     __ beqz(t0, L_short_aligned);
1613     __ j(RuntimeAddress(byte_copy_entry));
1614 
1615     __ BIND(L_short_aligned);
1616     __ srli(count, count, LogBytesPerShort);  // size => short_count
1617     __ j(RuntimeAddress(short_copy_entry));
1618     __ BIND(L_int_aligned);
1619     __ srli(count, count, LogBytesPerInt);    // size => int_count
1620     __ j(RuntimeAddress(int_copy_entry));
1621     __ BIND(L_long_aligned);
1622     __ srli(count, count, LogBytesPerLong);   // size => long_count
1623     __ j(RuntimeAddress(long_copy_entry));
1624 
1625     return start;
1626   }
1627 
1628   //
1629   //  Generate generic array copy stubs
1630   //
1631   //  Input:
1632   //    c_rarg0    -  src oop
1633   //    c_rarg1    -  src_pos (32-bits)
1634   //    c_rarg2    -  dst oop
1635   //    c_rarg3    -  dst_pos (32-bits)
1636   //    c_rarg4    -  element count (32-bits)
1637   //
1638   //  Output:
1639   //    x10 ==  0  -  success
1640   //    x10 == -1^K - failure, where K is partial transfer count
1641   //
1642   address generate_generic_copy(const char* name,
1643                                 address byte_copy_entry, address short_copy_entry,
1644                                 address int_copy_entry, address oop_copy_entry,
1645                                 address long_copy_entry, address checkcast_copy_entry) {
1646     assert_cond(byte_copy_entry != NULL && short_copy_entry != NULL &&
1647                 int_copy_entry != NULL && oop_copy_entry != NULL &&
1648                 long_copy_entry != NULL && checkcast_copy_entry != NULL);
1649     Label L_failed, L_failed_0, L_objArray;
1650     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1651 
1652     // Input registers
1653     const Register src        = c_rarg0;  // source array oop
1654     const Register src_pos    = c_rarg1;  // source position
1655     const Register dst        = c_rarg2;  // destination array oop
1656     const Register dst_pos    = c_rarg3;  // destination position
1657     const Register length     = c_rarg4;
1658 
1659     // Registers used as temps
1660     const Register dst_klass = c_rarg5;
1661 
1662     __ align(CodeEntryAlignment);
1663 
1664     StubCodeMark mark(this, "StubRoutines", name);
1665 
1666     address start = __ pc();
1667 
1668     __ enter(); // required for proper stackwalking of RuntimeStub frame
1669 
1670     // bump this on entry, not on exit:
1671     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1672 
1673     //-----------------------------------------------------------------------
1674     // Assembler stub will be used for this call to arraycopy
1675     // if the following conditions are met:
1676     //
1677     // (1) src and dst must not be null.
1678     // (2) src_pos must not be negative.
1679     // (3) dst_pos must not be negative.
1680     // (4) length  must not be negative.
1681     // (5) src klass and dst klass should be the same and not NULL.
1682     // (6) src and dst should be arrays.
1683     // (7) src_pos + length must not exceed length of src.
1684     // (8) dst_pos + length must not exceed length of dst.
1685     //
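         // For example, a NULL src or a negative src_pos fails check (1) or (2);
         // nothing has been transferred at that point, so K == 0 and the stub
         // returns -1^0 == -1 in x10.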
1686 
1687     // if [src == NULL] then return -1
1688     __ beqz(src, L_failed);
1689 
1690     // if [src_pos < 0] then return -1
1691     // i.e. sign bit set
1692     __ andi(t0, src_pos, 1UL << 31);
1693     __ bnez(t0, L_failed);
1694 
1695     // if [dst == NULL] then return -1
1696     __ beqz(dst, L_failed);
1697 
1698     // if [dst_pos < 0] then return -1
1699     // i.e. sign bit set
1700     __ andi(t0, dst_pos, 1UL << 31);
1701     __ bnez(t0, L_failed);
1702 
1703     // Registers used as temps
1704     const Register scratch_length    = x28; // elements count to copy
1705     const Register scratch_src_klass = x29; // array klass
1706     const Register lh                = x30; // layout helper
1707 
1708     // if [length < 0] then return -1
1709     __ addw(scratch_length, length, zr);    // length (elements count, 32-bits value)
1710     // i.e. sign bit set
1711     __ andi(t0, scratch_length, 1UL << 31);
1712     __ bnez(t0, L_failed);
1713 
1714     __ load_klass(scratch_src_klass, src);
1715 #ifdef ASSERT
1716     {
1717       BLOCK_COMMENT("assert klasses not null {");
1718       Label L1, L2;
1719       __ bnez(scratch_src_klass, L2);   // it is broken if klass is NULL
1720       __ bind(L1);
1721       __ stop("broken null klass");
1722       __ bind(L2);
1723       __ load_klass(t0, dst);
1724       __ beqz(t0, L1);     // this would be broken also
1725       BLOCK_COMMENT("} assert klasses not null done");
1726     }
1727 #endif
1728 
1729     // Load layout helper (32-bits)
1730     //
1731     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1732     // 32        30    24            16              8     2                 0
1733     //
1734     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1735     //
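         // For example, an int[] array has log2_element_size == 2 and array_tag
         // 0x3 in the top two bits, so its layout helper is negative; non-arrays
         // have a non-negative layout helper, which is what the sign-bit test
         // below relies on.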
1736 
1737     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1738 
1739     // Handle objArrays completely differently...
1740     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1741     __ lw(lh, Address(scratch_src_klass, lh_offset));
1742     __ mvw(t0, objArray_lh);
1743     __ beq(lh, t0, L_objArray);
1744 
1745     // if [src->klass() != dst->klass()] then return -1
1746     __ load_klass(t1, dst);
1747     __ bne(t1, scratch_src_klass, L_failed);
1748 
1749     // if [!src->is_Array()] then return -1
1750     // i.e. (lh >= 0)
1751     __ andi(t0, lh, 1UL << 31);
1752     __ beqz(t0, L_failed);
1753 
1754     // At this point, it is known to be a typeArray (array_tag 0x3).
1755 #ifdef ASSERT
1756     {
1757       BLOCK_COMMENT("assert primitive array {");
1758       Label L;
1759       __ mvw(t1, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1760       __ bge(lh, t1, L);
1761       __ stop("must be a primitive array");
1762       __ bind(L);
1763       BLOCK_COMMENT("} assert primitive array done");
1764     }
1765 #endif
1766 
1767     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1768                            t1, L_failed);
1769 
1770     // TypeArrayKlass
1771     //
1772     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1773     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1774     //
1775 
1776     const Register t0_offset = t0;    // array offset
1777     const Register x22_elsize = lh;   // element size
1778 
1779     // Get array_header_in_bytes()
1780     int lh_header_size_width = log2i_exact(Klass::_lh_header_size_mask + 1);
1781     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1782     __ slli(t0_offset, lh, registerSize - lh_header_size_msb);          // shift left to clear the bits above the header_size field
1783     __ srli(t0_offset, t0_offset, registerSize - lh_header_size_width); // array_offset
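         // The two shifts leave just the header_size field (bits 16..23 of lh),
         // i.e. the array header size in bytes, which is added to src and dst below.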
1784 
1785     __ add(src, src, t0_offset);           // src array offset
1786     __ add(dst, dst, t0_offset);           // dst array offset
1787     BLOCK_COMMENT("choose copy loop based on element size");
1788 
1789     // next registers should be set before the jump to corresponding stub
1790     const Register from     = c_rarg0;  // source array address
1791     const Register to       = c_rarg1;  // destination array address
1792     const Register count    = c_rarg2;  // elements count
1793 
1794     // The 'from', 'to' and 'count' registers must be set in this order
1795     // since they alias 'src', 'src_pos' and 'dst'.
1796 
1797     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1798 
1799     // The possible values of elsize are 0-3, i.e. log2i_exact(element
1800     // size in bytes).  We do a simple bitwise binary search.
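         // For example, elsize == 3 (long) has both low bits set: the test on bit 1
         // branches to L_copy_ints and the test on bit 0 there continues to
         // L_copy_longs, while elsize == 0 (byte) fails both tests and uses the
         // byte copy entry.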
1801   __ BIND(L_copy_bytes);
1802     __ andi(t0, x22_elsize, 2);
1803     __ bnez(t0, L_copy_ints);
1804     __ andi(t0, x22_elsize, 1);
1805     __ bnez(t0, L_copy_shorts);
1806     __ add(from, src, src_pos); // src_addr
1807     __ add(to, dst, dst_pos); // dst_addr
1808     __ addw(count, scratch_length, zr); // length
1809     __ j(RuntimeAddress(byte_copy_entry));
1810 
1811   __ BIND(L_copy_shorts);
1812     __ slli(t0, src_pos, 1);
1813     __ add(from, src, t0); // src_addr
1814     __ slli(t0, dst_pos, 1);
1815     __ add(to, dst, t0); // dst_addr
1816     __ addw(count, scratch_length, zr); // length
1817     __ j(RuntimeAddress(short_copy_entry));
1818 
1819   __ BIND(L_copy_ints);
1820     __ andi(t0, x22_elsize, 1);
1821     __ bnez(t0, L_copy_longs);
1822     __ slli(t0, src_pos, 2);
1823     __ add(from, src, t0); // src_addr
1824     __ slli(t0, dst_pos, 2);
1825     __ add(to, dst, t0); // dst_addr
1826     __ addw(count, scratch_length, zr); // length
1827     __ j(RuntimeAddress(int_copy_entry));
1828 
1829   __ BIND(L_copy_longs);
1830 #ifdef ASSERT
1831     {
1832       BLOCK_COMMENT("assert long copy {");
1833       Label L;
1834       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x22_elsize
1835       __ addw(lh, lh, zr);
1836       __ mvw(t0, LogBytesPerLong);
1837       __ beq(x22_elsize, t0, L);
1838       __ stop("must be long copy, but elsize is wrong");
1839       __ bind(L);
1840       BLOCK_COMMENT("} assert long copy done");
1841     }
1842 #endif
1843     __ slli(t0, src_pos, 3);
1844     __ add(from, src, t0); // src_addr
1845     __ slli(t0, dst_pos, 3);
1846     __ add(to, dst, t0); // dst_addr
1847     __ addw(count, scratch_length, zr); // length
1848     __ j(RuntimeAddress(long_copy_entry));
1849 
1850     // ObjArrayKlass
1851   __ BIND(L_objArray);
1852     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1853 
1854     Label L_plain_copy, L_checkcast_copy;
1855     // test array classes for subtyping
1856     __ load_klass(t2, dst);
1857     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
1858 
1859     // Identically typed arrays can be copied without element-wise checks.
1860     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1861                            t1, L_failed);
1862 
1863     __ slli(t0, src_pos, LogBytesPerHeapOop);
1864     __ add(from, t0, src);
1865     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1866     __ slli(t0, dst_pos, LogBytesPerHeapOop);
1867     __ add(to, t0, dst);
1868     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1869     __ addw(count, scratch_length, zr); // length
1870   __ BIND(L_plain_copy);
1871     __ j(RuntimeAddress(oop_copy_entry));
1872 
1873   __ BIND(L_checkcast_copy);
1874     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
1875     {
1876       // Before looking at dst.length, make sure dst is also an objArray.
1877       __ lwu(t0, Address(t2, lh_offset));
1878       __ mvw(t1, objArray_lh);
1879       __ bne(t0, t1, L_failed);
1880 
1881       // It is safe to examine both src.length and dst.length.
1882       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1883                              t2, L_failed);
1884 
1885       __ load_klass(dst_klass, dst); // reload
1886 
1887       // Marshal the base address arguments now, freeing registers.
1888       __ slli(t0, src_pos, LogBytesPerHeapOop);
1889       __ add(from, t0, src);
1890       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1891       __ slli(t0, dst_pos, LogBytesPerHeapOop);
1892       __ add(to, t0, dst);
1893       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1894       __ addw(count, length, zr);           // length (reloaded)
1895       const Register sco_temp = c_rarg3;      // this register is free now
1896       assert_different_registers(from, to, count, sco_temp,
1897                                  dst_klass, scratch_src_klass);
1898 
1899       // Generate the type check.
1900       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1901       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1902 
1903       // Smashes t0, t1
1904       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
1905 
1906       // Fetch destination element klass from the ObjArrayKlass header.
1907       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1908       __ ld(dst_klass, Address(dst_klass, ek_offset));
1909       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1910 
1911       // the checkcast_copy loop needs two extra arguments:
1912       assert(c_rarg3 == sco_temp, "#3 already in place");
1913       // Set up arguments for checkcast_copy_entry.
1914       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
1915       __ j(RuntimeAddress(checkcast_copy_entry));
1916     }
1917 
1918   __ BIND(L_failed);
1919     __ li(x10, -1);
1920     __ leave();   // required for proper stackwalking of RuntimeStub frame
1921     __ ret();
1922 
1923     return start;
1924   }
1925 
1926   //
1927   // Generate stub for array fill. If "aligned" is true, the
1928   // "to" address is assumed to be heapword aligned.
1929   //
1930   // Arguments for generated stub:
1931   //   to:    c_rarg0
1932   //   value: c_rarg1
1933   //   count: c_rarg2 treated as signed
1934   //
1935   address generate_fill(BasicType t, bool aligned, const char* name) {
1936     __ align(CodeEntryAlignment);
1937     StubCodeMark mark(this, "StubRoutines", name);
1938     address start = __ pc();
1939 
1940     BLOCK_COMMENT("Entry:");
1941 
1942     const Register to        = c_rarg0;  // destination array address
1943     const Register value     = c_rarg1;  // value
1944     const Register count     = c_rarg2;  // elements count
1945 
1946     const Register bz_base   = x28;      // base for block_zero routine
1947     const Register cnt_words = x29;      // temp register
1948     const Register tmp_reg   = t1;
1949 
1950     __ enter();
1951 
1952     Label L_fill_elements, L_exit1;
1953 
1954     int shift = -1;
1955     switch (t) {
1956       case T_BYTE:
1957         shift = 0;
1958 
1959         // Zero extend value
1960         // 8 bit -> 16 bit
1961         __ andi(value, value, 0xff);
1962         __ mv(tmp_reg, value);
1963         __ slli(tmp_reg, tmp_reg, 8);
1964         __ orr(value, value, tmp_reg);
1965 
1966         // 16 bit -> 32 bit
1967         __ mv(tmp_reg, value);
1968         __ slli(tmp_reg, tmp_reg, 16);
1969         __ orr(value, value, tmp_reg);
1970 
1971         __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1972         __ bltu(count, tmp_reg, L_fill_elements);
1973         break;
1974       case T_SHORT:
1975         shift = 1;
1976         // Zero extend value
1977         // 16 bit -> 32 bit
1978         __ andi(value, value, 0xffff);
1979         __ mv(tmp_reg, value);
1980         __ slli(tmp_reg, tmp_reg, 16);
1981         __ orr(value, value, tmp_reg);
1982 
1983         // Short arrays (< 8 bytes) fill by element
1984         __ mv(tmp_reg, 8 >> shift);
1985         __ bltu(count, tmp_reg, L_fill_elements);
1986         break;
1987       case T_INT:
1988         shift = 2;
1989 
1990         // Short arrays (< 8 bytes) fill by element
1991         __ mv(tmp_reg, 8 >> shift);
1992         __ bltu(count, tmp_reg, L_fill_elements);
1993         break;
1994       default: ShouldNotReachHere();
1995     }
1996 
1997     // Align source address at 8 bytes address boundary.
1998     Label L_skip_align1, L_skip_align2, L_skip_align4;
1999     if (!aligned) {
2000       switch (t) {
2001         case T_BYTE:
2002           // One byte misalignment happens only for byte arrays.
2003           __ andi(t0, to, 1);
2004           __ beqz(t0, L_skip_align1);
2005           __ sb(value, Address(to, 0));
2006           __ addi(to, to, 1);
2007           __ addiw(count, count, -1);
2008           __ bind(L_skip_align1);
2009           // Fallthrough
2010         case T_SHORT:
2011           // Two bytes misalignment happens only for byte and short (char) arrays.
2012           __ andi(t0, to, 2);
2013           __ beqz(t0, L_skip_align2);
2014           __ sh(value, Address(to, 0));
2015           __ addi(to, to, 2);
2016           __ addiw(count, count, -(2 >> shift));
2017           __ bind(L_skip_align2);
2018           // Fallthrough
2019         case T_INT:
2020           // Align to 8 bytes, we know we are 4 byte aligned to start.
2021           __ andi(t0, to, 4);
2022           __ beqz(t0, L_skip_align4);
2023           __ sw(value, Address(to, 0));
2024           __ addi(to, to, 4);
2025           __ addiw(count, count, -(4 >> shift));
2026           __ bind(L_skip_align4);
2027           break;
2028         default: ShouldNotReachHere();
2029       }
2030     }
2031 
2032     //
2033     //  Fill large chunks
2034     //
2035     __ srliw(cnt_words, count, 3 - shift); // number of words
2036 
2037     // 32 bit -> 64 bit
2038     __ andi(value, value, 0xffffffff);
2039     __ mv(tmp_reg, value);
2040     __ slli(tmp_reg, tmp_reg, 32);
2041     __ orr(value, value, tmp_reg);
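         // For example, a byte fill value of 0xAB has by now been replicated to
         // 0xABABABABABABABAB, so fill_words can store it eight bytes at a time.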
2042 
2043     __ slli(tmp_reg, cnt_words, 3 - shift);
2044     __ subw(count, count, tmp_reg);
2045     {
2046       __ fill_words(to, cnt_words, value);
2047     }
2048 
2049     // Remaining count is less than 8 bytes. Fill it by a single store.
2050     // Note that the total length is no less than 8 bytes.
2051     if (t == T_BYTE || t == T_SHORT) {
2052       __ beqz(count, L_exit1);
2053       __ slli(tmp_reg, count, shift);
2054       __ add(to, to, tmp_reg); // points to the end
2055       __ sd(value, Address(to, -8)); // overwrite some elements
2056       __ bind(L_exit1);
2057       __ leave();
2058       __ ret();
2059     }
2060 
2061     // Handle fills of less than 8 bytes.
2062     Label L_fill_2, L_fill_4, L_exit2;
2063     __ bind(L_fill_elements);
2064     switch (t) {
2065       case T_BYTE:
2066         __ andi(t0, count, 1);
2067         __ beqz(t0, L_fill_2);
2068         __ sb(value, Address(to, 0));
2069         __ addi(to, to, 1);
2070         __ bind(L_fill_2);
2071         __ andi(t0, count, 2);
2072         __ beqz(t0, L_fill_4);
2073         __ sh(value, Address(to, 0));
2074         __ addi(to, to, 2);
2075         __ bind(L_fill_4);
2076         __ andi(t0, count, 4);
2077         __ beqz(t0, L_exit2);
2078         __ sw(value, Address(to, 0));
2079         break;
2080       case T_SHORT:
2081         __ andi(t0, count, 1);
2082         __ beqz(t0, L_fill_4);
2083         __ sh(value, Address(to, 0));
2084         __ addi(to, to, 2);
2085         __ bind(L_fill_4);
2086         __ andi(t0, count, 2);
2087         __ beqz(t0, L_exit2);
2088         __ sw(value, Address(to, 0));
2089         break;
2090       case T_INT:
2091         __ beqz(count, L_exit2);
2092         __ sw(value, Address(to, 0));
2093         break;
2094       default: ShouldNotReachHere();
2095     }
2096     __ bind(L_exit2);
2097     __ leave();
2098     __ ret();
2099     return start;
2100   }
2101 
2102   void generate_arraycopy_stubs() {
2103     address entry                     = NULL;
2104     address entry_jbyte_arraycopy     = NULL;
2105     address entry_jshort_arraycopy    = NULL;
2106     address entry_jint_arraycopy      = NULL;
2107     address entry_oop_arraycopy       = NULL;
2108     address entry_jlong_arraycopy     = NULL;
2109     address entry_checkcast_arraycopy = NULL;
2110 
2111     generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards);
2112     generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards);
2113 
2114     StubRoutines::riscv64::_zero_blocks = generate_zero_blocks();
2115 
2116     //*** jbyte
2117     // Always need aligned and unaligned versions
2118     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_byte_copy(false, &entry,
2119                                                                                    "jbyte_disjoint_arraycopy");
2120     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_byte_copy(false, entry,
2121                                                                                    &entry_jbyte_arraycopy,
2122                                                                                    "jbyte_arraycopy");
2123     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, &entry,
2124                                                                                    "arrayof_jbyte_disjoint_arraycopy");
2125     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_byte_copy(true, entry, NULL,
2126                                                                                    "arrayof_jbyte_arraycopy");
2127 
2128     //*** jshort
2129     // Always need aligned and unaligned versions
2130     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2131                                                                                     "jshort_disjoint_arraycopy");
2132     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2133                                                                                     &entry_jshort_arraycopy,
2134                                                                                     "jshort_arraycopy");
2135     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2136                                                                                     "arrayof_jshort_disjoint_arraycopy");
2137     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2138                                                                                     "arrayof_jshort_arraycopy");
2139 
2140     //*** jint
2141     // Aligned versions
2142     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_copy(true, &entry,
2143                                                                                   "arrayof_jint_disjoint_arraycopy");
2144     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2145                                                                                   "arrayof_jint_arraycopy");
2146     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2147     // entry_jint_arraycopy always points to the unaligned version
2148     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_int_copy(false, &entry,
2149                                                                                   "jint_disjoint_arraycopy");
2150     StubRoutines::_jint_arraycopy                    = generate_conjoint_int_copy(false, entry,
2151                                                                                   &entry_jint_arraycopy,
2152                                                                                   "jint_arraycopy");
2153 
2154     //*** jlong
2155     // It is always aligned
2156     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, &entry,
2157                                                                                    "arrayof_jlong_disjoint_arraycopy");
2158     StubRoutines::_arrayof_jlong_arraycopy           = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2159                                                                                    "arrayof_jlong_arraycopy");
2160     StubRoutines::_jlong_disjoint_arraycopy          = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2161     StubRoutines::_jlong_arraycopy                   = StubRoutines::_arrayof_jlong_arraycopy;
2162 
2163     //*** oops
2164     {
2165       // With compressed oops we need unaligned versions; notice that
2166       // we overwrite entry_oop_arraycopy.
2167       bool aligned = !UseCompressedOops;
2168 
2169       StubRoutines::_arrayof_oop_disjoint_arraycopy
2170         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2171                                      /*dest_uninitialized*/false);
2172       StubRoutines::_arrayof_oop_arraycopy
2173         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2174                                      /*dest_uninitialized*/false);
2175       // Aligned versions without pre-barriers
2176       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2177         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2178                                      /*dest_uninitialized*/true);
2179       StubRoutines::_arrayof_oop_arraycopy_uninit
2180         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2181                                      /*dest_uninitialized*/true);
2182     }
2183 
2184     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2185     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2186     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2187     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2188 
2189     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2190     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2191                                                                         /*dest_uninitialized*/true);
2192 
2193 
2194     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2195                                                               entry_jbyte_arraycopy,
2196                                                               entry_jshort_arraycopy,
2197                                                               entry_jint_arraycopy,
2198                                                               entry_jlong_arraycopy);
2199 
2200     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2201                                                                entry_jbyte_arraycopy,
2202                                                                entry_jshort_arraycopy,
2203                                                                entry_jint_arraycopy,
2204                                                                entry_oop_arraycopy,
2205                                                                entry_jlong_arraycopy,
2206                                                                entry_checkcast_arraycopy);
2207 
2208     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2209     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2210     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2211     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2212     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2213     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2214   }
2215 
2216   // Safefetch stubs.
2217   void generate_safefetch(const char* name, int size, address* entry,
2218                           address* fault_pc, address* continuation_pc) {
2219     // safefetch signatures:
2220     //   int      SafeFetch32(int*      adr, int      errValue)
2221     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue)
2222     //
2223     // arguments:
2224     //   c_rarg0 = adr
2225     //   c_rarg1 = errValue
2226     //
2227     // result:
2228     //   x10 = *adr or errValue
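         // If the load at fault_pc faults, the signal handler resumes execution at
         // continuation_pc with errValue still in c_rarg1, so e.g. a SafeFetch32
         // of an unmapped address simply returns the supplied errValue.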
2229     assert_cond(entry != NULL && fault_pc != NULL && continuation_pc != NULL);
2230     StubCodeMark mark(this, "StubRoutines", name);
2231 
2232     // Entry point, pc or function descriptor.
2233     *entry = __ pc();
2234 
2235     // Load *adr into c_rarg1, may fault.
2236     *fault_pc = __ pc();
2237     switch (size) {
2238       case 4:
2239         // int32_t
2240         __ lw(c_rarg1, Address(c_rarg0, 0));
2241         break;
2242       case 8:
2243         // int64_t
2244         __ ld(c_rarg1, Address(c_rarg0, 0));
2245         break;
2246       default:
2247         ShouldNotReachHere();
2248     }
2249 
2250     // return errValue or *adr
2251     *continuation_pc = __ pc();
2252     __ mv(x10, c_rarg1);
2253     __ ret();
2254   }
2255 
2256   // code for comparing 16 bytes of strings with same encoding
2257   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
2258     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31;
2259     __ ld(tmp5, Address(str1));
2260     __ addi(str1, str1, 8);
2261     __ xorr(tmp4, tmp1, tmp2);
2262     __ ld(cnt1, Address(str2));
2263     __ addi(str2, str2, 8);
2264     __ bnez(tmp4, DIFF1);
2265     __ ld(tmp1, Address(str1));
2266     __ addi(str1, str1, 8);
2267     __ xorr(tmp4, tmp5, cnt1);
2268     __ ld(tmp2, Address(str2));
2269     __ addi(str2, str2, 8);
2270     __ bnez(tmp4, DIFF2);
2271   }
2272 
2273   // code for comparing 8 characters of strings with Latin1 and Utf16 encoding
2274   void compare_string_8_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
2275                               Label &DIFF2) {
2276     const Register strU = x12, curU = x7, strL = x29, tmp = x30;
2277     __ ld(tmpL, Address(strL));
2278     __ addi(strL, strL, 8);
2279     __ ld(tmpU, Address(strU));
2280     __ addi(strU, strU, 8);
2281     __ inflate_lo32(tmp, tmpL);
2282     __ mv(t0, tmp);
2283     __ xorr(tmp, curU, t0);
2284     __ bnez(tmp, DIFF2);
2285 
2286     __ ld(curU, Address(strU));
2287     __ addi(strU, strU, 8);
2288     __ inflate_hi32(tmp, tmpL);
2289     __ mv(t0, tmp);
2290     __ xorr(tmp, tmpU, t0);
2291     __ bnez(tmp, DIFF1);
2292   }
2293 
2294   // x10  = result
2295   // x11  = str1
2296   // x12  = cnt1
2297   // x13  = str2
2298   // x14  = cnt2
2299   // x28  = tmp1
2300   // x29  = tmp2
2301   // x30  = tmp3
2302   address generate_compare_long_string_different_encoding(bool isLU) {
2303     __ align(CodeEntryAlignment);
2304     StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL");
2305     address entry = __ pc();
2306     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
2307           DONE, CALCULATE_DIFFERENCE;
2308     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2309                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2310     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
2311 
2312     // cnt2 == number of characters left to compare
2313     // Check the first 4 symbols, which are already loaded
2314     __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2);
2315     __ mv(isLU ? tmp1 : tmp2, tmp3);
2316     __ addi(str1, str1, isLU ? wordSize / 2 : wordSize);
2317     __ addi(str2, str2, isLU ? wordSize : wordSize / 2);
2318     __ sub(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
2319     __ push_reg(spilled_regs, sp);
2320 
2321     if (isLU) {
2322       __ add(str1, str1, cnt2);
2323       __ slli(t0, cnt2, 1);
2324       __ add(str2, str2, t0);
2325     } else {
2326       __ slli(t0, cnt2, 1);
2327       __ add(str1, str1, t0);
2328       __ add(str2, str2, cnt2);
2329     }
2330     __ xorr(tmp3, tmp1, tmp2);
2331     __ mv(tmp5, tmp2);
2332     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2333 
2334     Register strU = isLU ? str2 : str1,
2335              strL = isLU ? str1 : str2,
2336              tmpU = isLU ? tmp5 : tmp1, // where to keep U for comparison
2337              tmpL = isLU ? tmp1 : tmp5; // where to keep L for comparison
2338 
2339     __ sub(tmp2, strL, cnt2); // strL pointer to load from
2340     __ slli(t0, cnt2, 1);
2341     __ sub(cnt1, strU, t0); // strU pointer to load from
2342 
2343     __ ld(tmp4, Address(cnt1));
2344     __ addi(cnt1, cnt1, 8);
2345     __ beqz(cnt2, LOAD_LAST); // no characters left except last load
2346     __ sub(cnt2, cnt2, 16);
2347     __ bltz(cnt2, TAIL);
2348     __ bind(SMALL_LOOP); // smaller loop
2349       __ sub(cnt2, cnt2, 16);
2350       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2351       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2352       __ bgez(cnt2, SMALL_LOOP);
2353       __ addi(t0, cnt2, 16);
2354       __ beqz(t0, LOAD_LAST);
2355     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
2356       __ slli(t0, cnt2, 1);
2357       __ add(cnt1, cnt1, t0); // Address of 8 bytes before last 4 characters in UTF-16 string
2358       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
2359       __ ld(tmp4, Address(cnt1, -8));
2360       // last 16 characters before last load
2361       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2362       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2363       __ j(LOAD_LAST);
2364     __ bind(DIFF2);
2365       __ mv(tmpU, tmp4);
2366     __ bind(DIFF1);
2367       __ mv(tmpL, t0);
2368       __ j(CALCULATE_DIFFERENCE);
2369     __ bind(LOAD_LAST);
2370       // Last 4 UTF-16 characters are already pre-loaded into tmp4 by compare_string_8_x_LU.
2371       // No need to load it again
2372       __ mv(tmpU, tmp4);
2373       __ ld(tmpL, Address(strL));
2374       __ inflate_lo32(tmp3, tmpL);
2375       __ mv(tmpL, tmp3);
2376       __ xorr(tmp3, tmpU, tmpL);
2377       __ beqz(tmp3, DONE);
2378 
2379       // Find the first different characters in the longwords and
2380       // compute their difference.
2381     __ bind(CALCULATE_DIFFERENCE);
2382       __ ctzc_bit(tmp4, tmp3);
2383       __ srl(tmp1, tmp1, tmp4);
2384       __ srl(tmp5, tmp5, tmp4);
2385       __ andi(tmp1, tmp1, 0xFFFF);
2386       __ andi(tmp5, tmp5, 0xFFFF);
2387       __ sub(result, tmp1, tmp5);
2388     __ bind(DONE);
2389       __ pop_reg(spilled_regs, sp);
2390       __ ret();
2391     return entry;
2392   }
2393 
2394   address generate_method_entry_barrier() {
2395     __ align(CodeEntryAlignment);
2396     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
2397 
2398     Label deoptimize_label;
2399 
2400     address start = __ pc();
2401 
2402     __ set_last_Java_frame(sp, fp, lr, t0);
2403 
2404     __ enter();
2405     __ add(t1, sp, wordSize);
2406 
2407     __ sub(sp, sp, 4 * wordSize);
2408 
2409     __ push_call_clobbered_registers();
2410 
2411     __ mv(c_rarg0, t1);
2412     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
2413 
2414     __ reset_last_Java_frame(true);
2415 
2416     __ mv(t0, x10);
2417 
2418     __ pop_call_clobbered_registers();
2419 
2420     __ bnez(t0, deoptimize_label);
2421 
2422     __ leave();
2423     __ ret();
2424 
2425     __ BIND(deoptimize_label);
2426 
2427     __ ld(t0, Address(sp, 0));
2428     __ ld(fp, Address(sp, wordSize));
2429     __ ld(lr, Address(sp, wordSize * 2));
2430     __ ld(t1, Address(sp, wordSize * 3));
2431 
2432     __ mv(sp, t0);
2433     __ jr(t1);
2434 
2435     return start;
2436   }
2437 
2438   // x10  = result
2439   // x11  = str1
2440   // x12  = cnt1
2441   // x13  = str2
2442   // x14  = cnt2
2443   // x28  = tmp1
2444   // x29  = tmp2
2445   // x30  = tmp3
2446   // x31  = tmp4
2447   address generate_compare_long_string_same_encoding(bool isLL) {
2448     __ align(CodeEntryAlignment);
2449     StubCodeMark mark(this, "StubRoutines", isLL ?
2450                       "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU");
2451     address entry = __ pc();
2452     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
2453           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
2454     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2455                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2456     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
2457 
2458     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used
2459     // Update the cnt2 counter to account for the 8 bytes already loaded
2460     __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
2461     // update pointers, because of previous read
2462     __ add(str1, str1, wordSize);
2463     __ add(str2, str2, wordSize);
2464     // less than 16 bytes left?
2465     __ sub(cnt2, cnt2, isLL ? 16 : 8);
2466     __ push_reg(spilled_regs, sp);
2467     __ bltz(cnt2, TAIL);
2468     __ bind(SMALL_LOOP);
2469       compare_string_16_bytes_same(DIFF, DIFF2);
2470       __ sub(cnt2, cnt2, isLL ? 16 : 8);
2471       __ bgez(cnt2, SMALL_LOOP);
2472     __ bind(TAIL);
2473       __ addi(cnt2, cnt2, isLL ? 16 : 8);
2474       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
2475       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2476       __ blez(cnt2, CHECK_LAST);
2477       __ xorr(tmp4, tmp1, tmp2);
2478       __ bnez(tmp4, DIFF);
2479       __ ld(tmp1, Address(str1));
2480       __ addi(str1, str1, 8);
2481       __ ld(tmp2, Address(str2));
2482       __ addi(str2, str2, 8);
2483       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2484     __ bind(CHECK_LAST);
2485       if (!isLL) {
2486         __ add(cnt2, cnt2, cnt2); // now in bytes
2487       }
2488       __ xorr(tmp4, tmp1, tmp2);
2489       __ bnez(tmp4, DIFF);
2490       __ add(str1, str1, cnt2);
2491       __ ld(tmp5, Address(str1));
2492       __ add(str2, str2, cnt2);
2493       __ ld(cnt1, Address(str2));
2494       __ xorr(tmp4, tmp5, cnt1);
2495       __ beqz(tmp4, LENGTH_DIFF);
2496       // Find the first different characters in the longwords and
2497       // compute their difference.
2498     __ bind(DIFF2);
2499       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2500       __ srl(tmp5, tmp5, tmp3);
2501       __ srl(cnt1, cnt1, tmp3);
2502       if (isLL) {
2503         __ andi(tmp5, tmp5, 0xFF);
2504         __ andi(cnt1, cnt1, 0xFF);
2505       } else {
2506         __ andi(tmp5, tmp5, 0xFFFF);
2507         __ andi(cnt1, cnt1, 0xFFFF);
2508       }
2509       __ sub(result, tmp5, cnt1);
2510       __ j(LENGTH_DIFF);
2511     __ bind(DIFF);
2512       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2513       __ srl(tmp1, tmp1, tmp3);
2514       __ srl(tmp2, tmp2, tmp3);
2515       if (isLL) {
2516         __ andi(tmp1, tmp1, 0xFF);
2517         __ andi(tmp2, tmp2, 0xFF);
2518       } else {
2519         __ andi(tmp1, tmp1, 0xFFFF);
2520         __ andi(tmp2, tmp2, 0xFFFF);
2521       }
2522       __ sub(result, tmp1, tmp2);
2523       __ j(LENGTH_DIFF);
2524     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
2525       __ xorr(tmp4, tmp1, tmp2);
2526       __ bnez(tmp4, DIFF);
2527     __ bind(LENGTH_DIFF);
2528       __ pop_reg(spilled_regs, sp);
2529       __ ret();
2530     return entry;
2531   }
2532 
2533   void generate_compare_long_strings() {
2534     StubRoutines::riscv64::_compare_long_string_LL = generate_compare_long_string_same_encoding(true);
2535     StubRoutines::riscv64::_compare_long_string_UU = generate_compare_long_string_same_encoding(false);
2536     StubRoutines::riscv64::_compare_long_string_LU = generate_compare_long_string_different_encoding(true);
2537     StubRoutines::riscv64::_compare_long_string_UL = generate_compare_long_string_different_encoding(false);
2538   }
2539 
2540   // x10 result
2541   // x11 src
2542   // x12 src count
2543   // x13 pattern
2544   // x14 pattern count
2545   address generate_string_indexof_linear(bool needle_isL, bool haystack_isL)
2546   {
2547     const char* stubName = needle_isL
2548            ? (haystack_isL ? "indexof_linear_ll" : "indexof_linear_ul")
2549            : "indexof_linear_uu";
2550     __ align(CodeEntryAlignment);
2551     StubCodeMark mark(this, "StubRoutines", stubName);
2552     address entry = __ pc();
2553 
2554     int needle_chr_size = needle_isL ? 1 : 2;
2555     int haystack_chr_size = haystack_isL ? 1 : 2;
2556     int needle_chr_shift = needle_isL ? 0 : 1;
2557     int haystack_chr_shift = haystack_isL ? 0 : 1;
2558     bool isL = needle_isL && haystack_isL;
2559     // parameters
2560     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
2561     // temporary registers
2562     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
2563     // redefinitions
2564     Register ch1 = x28, ch2 = x29;
2565     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
2566 
2567     __ push_reg(spilled_regs, sp);
2568 
2569     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
2570           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
2571           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
2572           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
2573           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
2574           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
2575 
2576     __ ld(ch1, Address(needle));
2577     __ ld(ch2, Address(haystack));
2578     // src.length - pattern.length
2579     __ sub(haystack_len, haystack_len, needle_len);
2580 
2581     // first is needle[0]
2582     __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first);
2583     __ mv(mask1, haystack_isL ? 0x0101010101010101 : 0x0001000100010001);
2584     __ mul(first, first, mask1);
2585     __ mv(mask2, haystack_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
2586     if (needle_isL != haystack_isL) {
2587       __ mv(tmp, ch1);
2588     }
2589     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
2590     __ blez(haystack_len, L_SMALL);
2591 
2592     if (needle_isL != haystack_isL) {
2593       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2594     }
2595     // xorr, sub, orr, notr, andr
2596     // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
2597     // eg:
2598     // first:        aa aa aa aa aa aa aa aa
2599     // ch2:          aa aa li nx jd ka aa aa
2600     // match_mask:   80 80 00 00 00 00 80 80
2601     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
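         // compute_match_mask uses the SWAR zero-byte trick: with x = ch2 ^ first it
         // computes (x - mask1) & ~(x | mask2), which leaves 0x80/0x8000 in the lanes
         // where ch2 matches first (the lowest such lane is what the ctz-based scan
         // below relies on), as in the example above.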
2602 
2603     // search first char of needle, if success, goto L_HAS_ZERO;
2604     __ bnez(match_mask, L_HAS_ZERO);
2605     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2606     __ add(result, result, wordSize / haystack_chr_size);
2607     __ add(haystack, haystack, wordSize);
2608     __ bltz(haystack_len, L_POST_LOOP);
2609 
2610     __ bind(L_LOOP);
2611     __ ld(ch2, Address(haystack));
2612     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2613     __ bnez(match_mask, L_HAS_ZERO);
2614 
2615     __ bind(L_LOOP_PROCEED);
2616     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2617     __ add(haystack, haystack, wordSize);
2618     __ add(result, result, wordSize / haystack_chr_size);
2619     __ bgez(haystack_len, L_LOOP);
2620 
2621     __ bind(L_POST_LOOP);
2622     __ mv(ch2, -wordSize / haystack_chr_size);
2623     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
2624     __ ld(ch2, Address(haystack));
2625     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2626     __ neg(haystack_len, haystack_len);
2627     __ xorr(ch2, first, ch2);
2628     __ sub(match_mask, ch2, mask1);
2629     __ orr(ch2, ch2, mask2);
2630     __ mv(trailing_zeros, -1); // all bits set
2631     __ j(L_SMALL_PROCEED);
2632 
2633     __ align(OptoLoopAlignment);
2634     __ bind(L_SMALL);
2635     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2636     __ neg(haystack_len, haystack_len);
2637     if (needle_isL != haystack_isL) {
2638       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2639     }
2640     __ xorr(ch2, first, ch2);
2641     __ sub(match_mask, ch2, mask1);
2642     __ orr(ch2, ch2, mask2);
2643     __ mv(trailing_zeros, -1); // all bits set
2644 
2645     __ bind(L_SMALL_PROCEED);
2646     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
2647     __ notr(ch2, ch2);
2648     __ andr(match_mask, match_mask, ch2);
2649     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
2650     __ beqz(match_mask, NOMATCH);
2651 
2652     __ bind(L_SMALL_HAS_ZERO_LOOP);
2653     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros
2654     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2655     __ mv(ch2, wordSize / haystack_chr_size);
2656     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
2657     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2658     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2659     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2660 
2661     __ bind(L_SMALL_CMP_LOOP);
2662     __ slli(first, trailing_zeros, needle_chr_shift);
2663     __ add(first, needle, first);
2664     __ slli(ch2, trailing_zeros, haystack_chr_shift);
2665     __ add(ch2, haystack, ch2);
2666     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
2667     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2668     __ add(trailing_zeros, trailing_zeros, 1);
2669     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
2670     __ beq(first, ch2, L_SMALL_CMP_LOOP);
2671 
2672     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
2673     __ beqz(match_mask, NOMATCH);
2674     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2675     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2676     __ add(result, result, 1);
2677     __ add(haystack, haystack, haystack_chr_size);
2678     __ j(L_SMALL_HAS_ZERO_LOOP);
2679 
2680     __ align(OptoLoopAlignment);
2681     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
2682     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2683     __ j(DONE);
2684 
2685     __ align(OptoLoopAlignment);
2686     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
2687     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2688     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2689     __ j(DONE);
2690 
2691     __ align(OptoLoopAlignment);
2692     __ bind(L_HAS_ZERO);
2693     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2694     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2695     __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
2696     __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits)
2697     __ sub(result, result, 1); // array index from 0, so result -= 1
2698 
2699     __ bind(L_HAS_ZERO_LOOP);
2700     __ mv(needle_len, wordSize / haystack_chr_size);
2701     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
2702     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
2703     // load next 8 bytes from haystack, and increase result index
2704     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2705     __ add(result, result, 1);
2706     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2707     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2708 
2709     // compare one char
2710     __ bind(L_CMP_LOOP);
2711     __ slli(needle_len, trailing_zeros, needle_chr_shift);
2712     __ add(needle_len, needle, needle_len);
2713     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
2714     __ slli(ch2, trailing_zeros, haystack_chr_shift);
2715     __ add(ch2, haystack, ch2);
2716     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2717     __ add(trailing_zeros, trailing_zeros, 1); // next char index
2718     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
2719     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
2720     __ beq(needle_len, ch2, L_CMP_LOOP);
2721 
2722     __ bind(L_CMP_LOOP_NOMATCH);
2723     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
2724     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index
2725     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2726     __ add(haystack, haystack, haystack_chr_size);
2727     __ j(L_HAS_ZERO_LOOP);
2728 
2729     __ align(OptoLoopAlignment);
2730     __ bind(L_CMP_LOOP_LAST_CMP);
2731     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
2732     __ j(DONE);
2733 
2734     __ align(OptoLoopAlignment);
2735     __ bind(L_CMP_LOOP_LAST_CMP2);
2736     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2737     __ add(result, result, 1);
2738     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2739     __ j(DONE);
2740 
2741     __ align(OptoLoopAlignment);
2742     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
2743     // 1) Restore the "result" index. Until the L_HAS_ZERO block the index was a
2744     // multiple of wordSize/str2_chr_size. The byte octet was analyzed in
2745     // L_HAS_ZERO_LOOP, so result was increased by at most wordSize/str2_chr_size - 1
2746     // and the respective high bits are unchanged. L_LOOP_PROCEED will increase
2747     // result by the number of characters analyzed, so we can simply reset the
2748     // lower bits of result here: clear 2 lower bits for UU/UL and 3 bits for LL.
2749     // 2) Restore needle_len and haystack_len from the "compressed" haystack_len.
2750     // 3) Advance haystack to the next octet. result & 7 (or & 3) is the index of
2751     // the last analyzed substring inside the current octet, so move haystack back
2752     // to the start of that octet; L_LOOP_PROCEED then advances it to the next one.
2753     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
2754     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
2755     __ andi(result, result, haystack_isL ? -8 : -4);
2756     __ slli(tmp, match_mask, haystack_chr_shift);
2757     __ sub(haystack, haystack, tmp);
2758     __ addw(haystack_len, haystack_len, zr);
2759     __ j(L_LOOP_PROCEED);
2760 
2761     __ align(OptoLoopAlignment);
2762     __ bind(NOMATCH);
2763     __ mv(result, -1);
2764 
2765     __ bind(DONE);
2766     __ pop_reg(spilled_regs, sp);
2767     __ ret();
2768     return entry;
2769   }
2770 
2771   void generate_string_indexof_stubs()
2772   {
2773     StubRoutines::riscv64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
2774     StubRoutines::riscv64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
2775     StubRoutines::riscv64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
2776   }
2777 
2778   // Continuation point for throwing of implicit exceptions that are
2779   // not handled in the current activation. Fabricates an exception
2780   // oop and initiates normal exception dispatching in this
2781   // frame. Since we need to preserve callee-saved values (currently
2782   // only for C2, but done for C1 as well) we need a callee-saved oop
2783   // map and therefore have to make these stubs into RuntimeStubs
2784   // rather than BufferBlobs.  If the compiler needs all registers to
2785   // be preserved between the fault point and the exception handler
2786   // then it must assume responsibility for that in
2787   // AbstractCompiler::continuation_for_implicit_null_exception or
2788   // continuation_for_implicit_division_by_zero_exception. All other
2789   // implicit exceptions (e.g., NullPointerException or
2790   // AbstractMethodError on entry) are either at call sites or
2791   // otherwise assume that stack unwinding will be initiated, so
2792   // caller saved registers were assumed volatile in the compiler.
2793 
2794 #undef __
2795 #define __ masm->
2796 
2797   address generate_throw_exception(const char* name,
2798                                    address runtime_entry,
2799                                    Register arg1 = noreg,
2800                                    Register arg2 = noreg) {
2801     // Information about frame layout at time of blocking runtime call.
2802     // Note that we only have to preserve callee-saved registers since
2803     // the compilers are responsible for supplying a continuation point
2804     // if they expect all registers to be preserved.
2805     // n.b. riscv64 asserts that frame::arg_reg_save_area_bytes == 0
2806     assert_cond(runtime_entry != NULL);
2807     enum layout {
2808       fp_off = 0,
2809       fp_off2,
2810       return_off,
2811       return_off2,
2812       framesize // inclusive of return address
2813     };
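    // With 4-byte slots this describes a minimal 16-byte frame that holds only
    // the saved fp (slots 0-1) and the saved return address (slots 2-3).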
2814 
2815     const int insts_size = 512;
2816     const int locs_size  = 64;
2817 
2818     CodeBuffer code(name, insts_size, locs_size);
2819     OopMapSet* oop_maps  = new OopMapSet();
2820     MacroAssembler* masm = new MacroAssembler(&code);
2821     assert_cond(oop_maps != NULL && masm != NULL);
2822 
2823     address start = __ pc();
2824 
    // This is an inlined and slightly modified version of call_VM
    // which is able to fetch the return PC out of thread-local
    // storage and also sets up last_Java_sp slightly differently
    // from the real call_VM.
2829 
2830     __ enter(); // Save FP and LR before call
2831 
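    // framesize is counted in 32-bit slots, so an even framesize / 2 means the
    // frame occupies a multiple of 16 bytes.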
2832     assert(is_even(framesize / 2), "sp not 16-byte aligned");
2833 
2834     // lr and fp are already in place
2835     __ addi(sp, fp, 0 - (((unsigned)framesize - 4) << LogBytesPerInt)); // prolog
2836 
2837     int frame_complete = __ pc() - start;
2838 
2839     // Set up last_Java_sp and last_Java_fp
2840     address the_pc = __ pc();
2841     __ set_last_Java_frame(sp, fp, the_pc, t0);
2842 
2843     // Call runtime
2844     if (arg1 != noreg) {
2845       assert(arg2 != c_rarg1, "clobbered");
2846       __ mv(c_rarg1, arg1);
2847     }
2848     if (arg2 != noreg) {
2849       __ mv(c_rarg2, arg2);
2850     }
2851     __ mv(c_rarg0, xthread);
2852     BLOCK_COMMENT("call runtime_entry");
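    // Materialize the runtime entry address in t0; the low 12 bits are returned
    // in offset and folded into the jalr immediate. The call links through x1 (ra).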
2853     int32_t offset = 0;
2854     __ movptr_with_offset(t0, runtime_entry, offset);
2855     __ jalr(x1, t0, offset);
2856 
2857     // Generate oop map
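    // (no oops are live in this frame across the call, so the map stays empty)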
2858     OopMap* map = new OopMap(framesize, 0);
2859     assert_cond(map != NULL);
2860 
2861     oop_maps->add_gc_map(the_pc - start, map);
2862 
2863     __ reset_last_Java_frame(true);
2864 
2865     __ leave();
2866 
2867     // check for pending exceptions
2868 #ifdef ASSERT
2869     Label L;
2870     __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
2871     __ bnez(t0, L);
2872     __ should_not_reach_here();
2873     __ bind(L);
2874 #endif // ASSERT
2875     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2876 
2877 
2878     // codeBlob framesize is in words (not VMRegImpl::slot_size)
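    // (shifting by LogBytesPerWord - LogBytesPerInt == 1 converts 4-byte slots
    // into 8-byte words)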
2879     RuntimeStub* stub =
2880       RuntimeStub::new_runtime_stub(name,
2881                                     &code,
2882                                     frame_complete,
2883                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
2884                                     oop_maps, false);
    assert(stub != NULL, "failed to create runtime stub!");
2886     return stub->entry_point();
2887   }
2888 
2889   // Initialization
2890   void generate_initial() {
    // Generates the initial stubs and initializes the entry points
2892 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
2898 
2899     StubRoutines::_forward_exception_entry = generate_forward_exception();
2900 
2901     StubRoutines::_call_stub_entry =
2902       generate_call_stub(StubRoutines::_call_stub_return_address);
2903 
    // is referenced by megamorphic calls
2905     StubRoutines::_catch_exception_entry = generate_catch_exception();
2906 
2907     // Build this early so it's available for the interpreter.
2908     StubRoutines::_throw_StackOverflowError_entry =
2909       generate_throw_exception("StackOverflowError throw_exception",
2910                                CAST_FROM_FN_PTR(address,
2911                                                 SharedRuntime::throw_StackOverflowError));
2912     StubRoutines::_throw_delayed_StackOverflowError_entry =
2913       generate_throw_exception("delayed StackOverflowError throw_exception",
2914                                CAST_FROM_FN_PTR(address,
2915                                                 SharedRuntime::throw_delayed_StackOverflowError));
2916     // Safefetch stubs.
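    // SafeFetch32/SafeFetchN load a value from a possibly invalid address. If
    // the load faults at the recorded fault PC, execution is resumed at the
    // continuation PC and the caller-supplied fallback value is returned instead.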
2917     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
2918                                                        &StubRoutines::_safefetch32_fault_pc,
2919                                                        &StubRoutines::_safefetch32_continuation_pc);
2920     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
2921                                                        &StubRoutines::_safefetchN_fault_pc,
2922                                                        &StubRoutines::_safefetchN_continuation_pc);
2923   }
2924 
2925   void generate_all() {
2926     // support for verify_oop (must happen after universe_init)
2927     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
2928     StubRoutines::_throw_AbstractMethodError_entry =
2929       generate_throw_exception("AbstractMethodError throw_exception",
2930                                CAST_FROM_FN_PTR(address,
2931                                                 SharedRuntime::
2932                                                 throw_AbstractMethodError));
2933 
2934     StubRoutines::_throw_IncompatibleClassChangeError_entry =
2935       generate_throw_exception("IncompatibleClassChangeError throw_exception",
2936                                CAST_FROM_FN_PTR(address,
2937                                                 SharedRuntime::
2938                                                 throw_IncompatibleClassChangeError));
2939 
2940     StubRoutines::_throw_NullPointerException_at_call_entry =
2941       generate_throw_exception("NullPointerException at call throw_exception",
2942                                CAST_FROM_FN_PTR(address,
2943                                                 SharedRuntime::
2944                                                 throw_NullPointerException_at_call));
2945     // arraycopy stubs used by compilers
2946     generate_arraycopy_stubs();
2947 
2948     generate_compare_long_strings();
2949 
2950     generate_string_indexof_stubs();
2951 
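    // The method entry barrier stub is only needed when the selected GC
    // actually installs nmethod entry barriers.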
2952     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
2953     if (bs_nm != NULL) {
2954       StubRoutines::riscv64::_method_entry_barrier = generate_method_entry_barrier();
2955     }
2956 
2957     StubRoutines::riscv64::set_completed();
2958   }
2959 
2960  public:
2961   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2962     if (all) {
2963       generate_all();
2964     } else {
2965       generate_initial();
2966     }
2967   }
2968 
2969   ~StubGenerator() {}
2970 }; // end class declaration
2971 
2972 #define UCM_TABLE_MAX_ENTRIES 8
2973 void StubGenerator_generate(CodeBuffer* code, bool all) {
2974   if (UnsafeCopyMemory::_table == NULL) {
2975     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
2976   }
2977 
2978   StubGenerator g(code, all);
2979 }