/*
 * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_riscv.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ la(t1, ExternalAddress((address)&counter));
    __ lwu(t0, Address(t1, 0));
    __ addiw(t0, t0, 1);
    __ sw(t0, Address(t1, 0));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save x1 (ra) as the return PC at the base of the frame and
  // link x8 (fp) below it as the frame pointer, installing sp (x2)
  // into fp.
  //
  // we save x10-x17, which accounts for all the C arguments.
  //
  // TODO: strictly speaking, do we need to save them all? they are
  // treated as volatile by C so we could omit saving the ones we are
  // going to place in global registers (thread? method?) or those we
  // only use during setup of the Java call?
  //
  // we don't need to save x5 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save x6-x7 and x28-x31 which both C and Java treat
  // as volatile
  //
  // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
  // registers and C expects to be callee-save
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -34 [ argument word 1      ]
  // -33 [ saved f27            ] <--- sp_after_call
  // -32 [ saved f26            ]
  // -31 [ saved f25            ]
  // -30 [ saved f24            ]
  // -29 [ saved f23            ]
  // -28 [ saved f22            ]
  // -27 [ saved f21            ]
  // -26 [ saved f20            ]
  // -25 [ saved f19            ]
  // -24 [ saved f18            ]
  // -23 [ saved f9             ]
  // -22 [ saved f8             ]
  // -21 [ saved x27            ]
  // -20 [ saved x26            ]
  // -19 [ saved x25            ]
  // -18 [ saved x24            ]
  // -17 [ saved x23            ]
  // -16 [ saved x22            ]
  // -15 [ saved x21            ]
  // -14 [ saved x20            ]
  // -13 [ saved x19            ]
  // -12 [ saved x18            ]
  // -11 [ saved x9             ]
  // -10 [ call wrapper   (x10) ]
  //  -9 [ result         (x11) ]
  //  -8 [ result type    (x12) ]
  //  -7 [ method         (x13) ]
  //  -6 [ entry point    (x14) ]
  //  -5 [ parameters     (x15) ]
  //  -4 [ parameter size (x16) ]
  //  -3 [ thread         (x17) ]
  //  -2 [ saved fp       (x8)  ]
  //  -1 [ saved ra       (x1)  ]
  //   0 [                      ] <--- fp == saved sp (x2)
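
  // For reference, a sketch of the C-side invocation of this stub (not part
  // of this file; assumes the CallStub function pointer type declared in
  // stubRoutines.hpp and the JavaCalls::call_helper path). Illustration
  // only, to make the c_rarg0..c_rarg7 assignments above concrete:
  //
  //   StubRoutines::call_stub()(
  //       (address)&link,          // c_rarg0: call wrapper
  //       result_val_address,      // c_rarg1: result
  //       result_type,             // c_rarg2: BasicType
  //       method(),                // c_rarg3: Method*
  //       entry_point,             // c_rarg4: interpreter entry point
  //       parameter_address,       // c_rarg5: intptr_t* parameters
  //       size_of_parameters,      // c_rarg6: parameter count in words
  //       thread);                 // c_rarg7: Thread*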

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -33,

    f27_off            = -33,
    f26_off            = -32,
    f25_off            = -31,
    f24_off            = -30,
    f23_off            = -29,
    f22_off            = -28,
    f21_off            = -27,
    f20_off            = -26,
    f19_off            = -25,
    f18_off            = -24,
    f9_off             = -23,
    f8_off             = -22,

    x27_off            = -21,
    x26_off            = -20,
    x25_off            = -19,
    x24_off            = -18,
    x23_off            = -17,
    x22_off            = -16,
    x21_off            = -15,
    x20_off            = -14,
    x19_off            = -13,
    x18_off            = -12,
    x9_off             = -11,

    call_wrapper_off   = -10,
    result_off         = -9,
    result_type_off    = -8,
    method_off         = -7,
    entry_point_off    = -6,
    parameters_off     = -5,
    parameter_size_off = -4,
    thread_off         = -3,
    fp_f               = -2,
    retaddr_off        = -1,
  };
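
  // Worked example of the offsets above (illustration only, wordSize == 8):
  // the thread slot lives at fp + thread_off * wordSize == fp - 24, and
  // once the frame is set up sp == fp + sp_after_call_off * wordSize ==
  // fp - 264, so the register save area spans [fp - 264, fp).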

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call (fp, sp_after_call_off  * wordSize);

    const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
    const Address result        (fp, result_off         * wordSize);
    const Address result_type   (fp, result_type_off    * wordSize);
    const Address method        (fp, method_off         * wordSize);
    const Address entry_point   (fp, entry_point_off    * wordSize);
    const Address parameters    (fp, parameters_off     * wordSize);
    const Address parameter_size(fp, parameter_size_off * wordSize);

    const Address thread        (fp, thread_off         * wordSize);

    const Address f27_save      (fp, f27_off            * wordSize);
    const Address f26_save      (fp, f26_off            * wordSize);
    const Address f25_save      (fp, f25_off            * wordSize);
    const Address f24_save      (fp, f24_off            * wordSize);
    const Address f23_save      (fp, f23_off            * wordSize);
    const Address f22_save      (fp, f22_off            * wordSize);
    const Address f21_save      (fp, f21_off            * wordSize);
    const Address f20_save      (fp, f20_off            * wordSize);
    const Address f19_save      (fp, f19_off            * wordSize);
    const Address f18_save      (fp, f18_off            * wordSize);
    const Address f9_save       (fp, f9_off             * wordSize);
    const Address f8_save       (fp, f8_off             * wordSize);

    const Address x27_save      (fp, x27_off            * wordSize);
    const Address x26_save      (fp, x26_off            * wordSize);
    const Address x25_save      (fp, x25_off            * wordSize);
    const Address x24_save      (fp, x24_off            * wordSize);
    const Address x23_save      (fp, x23_off            * wordSize);
    const Address x22_save      (fp, x22_off            * wordSize);
    const Address x21_save      (fp, x21_off            * wordSize);
    const Address x20_save      (fp, x20_off            * wordSize);
    const Address x19_save      (fp, x19_off            * wordSize);
    const Address x18_save      (fp, x18_off            * wordSize);

    const Address x9_save       (fp, x9_off             * wordSize);

    // stub code

    address riscv_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ addi(sp, fp, sp_after_call_off * wordSize);

    // save register parameters and Java temporary/global registers
    // n.b. we save thread even though it gets installed in
    // xthread because we want to sanity check tp later
    __ sd(c_rarg7, thread);
    __ sw(c_rarg6, parameter_size);
    __ sd(c_rarg5, parameters);
    __ sd(c_rarg4, entry_point);
    __ sd(c_rarg3, method);
    __ sd(c_rarg2, result_type);
    __ sd(c_rarg1, result);
    __ sd(c_rarg0, call_wrapper);

    __ sd(x9, x9_save);

    __ sd(x18, x18_save);
    __ sd(x19, x19_save);
    __ sd(x20, x20_save);
    __ sd(x21, x21_save);
    __ sd(x22, x22_save);
    __ sd(x23, x23_save);
    __ sd(x24, x24_save);
    __ sd(x25, x25_save);
    __ sd(x26, x26_save);
    __ sd(x27, x27_save);

    __ fsd(f8,  f8_save);
    __ fsd(f9,  f9_save);
    __ fsd(f18, f18_save);
    __ fsd(f19, f19_save);
    __ fsd(f20, f20_save);
    __ fsd(f21, f21_save);
    __ fsd(f22, f22_save);
    __ fsd(f23, f23_save);
    __ fsd(f24, f24_save);
    __ fsd(f25, f25_save);
    __ fsd(f26, f26_save);
    __ fsd(f27, f27_save);

    // install Java thread in global register now that we have saved
    // whatever value it held
    __ mv(xthread, c_rarg7);

    // And method
    __ mv(xmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
      __ beqz(t0, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mv(esp, sp);
    __ slli(t0, c_rarg6, LogBytesPerWord);
    __ sub(t0, sp, t0); // Move SP out of the way
    __ andi(sp, t0, -2 * wordSize);
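    // The slli/sub/andi sequence above reserves c_rarg6 words of argument
    // space below the old sp and rounds the result down to a 16-byte
    // boundary (the RISC-V psABI stack alignment). Illustration: with 3
    // parameters we reserve 24 bytes, then round sp down to the next
    // multiple of 16.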

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ beqz(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ld(t0, c_rarg5, 0);
    __ addi(c_rarg5, c_rarg5, wordSize);
    __ addi(c_rarg6, c_rarg6, -1);
    __ push_reg(t0);
    __ bgtz(c_rarg6, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      xmethod: Method*
    //      x30: sender sp
    BLOCK_COMMENT("call Java function");
    __ mv(x30, sp);
    __ jalr(c_rarg4);

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in x10
    // and a floating result in j_farg0
    __ ld(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ld(j_rarg1, result_type);
    __ li(t0, (u1)T_OBJECT);
    __ beq(j_rarg1, t0, is_long);
    __ li(t0, (u1)T_LONG);
    __ beq(j_rarg1, t0, is_long);
    __ li(t0, (u1)T_FLOAT);
    __ beq(j_rarg1, t0, is_float);
    __ li(t0, (u1)T_DOUBLE);
    __ beq(j_rarg1, t0, is_double);

    // handle T_INT case
    __ sw(x10, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ addi(esp, fp, sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ld(t0, thread);
      __ bne(xthread, t0, S);
      __ get_thread(t0);
      __ beq(xthread, t0, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ fld(f27, f27_save);
    __ fld(f26, f26_save);
    __ fld(f25, f25_save);
    __ fld(f24, f24_save);
    __ fld(f23, f23_save);
    __ fld(f22, f22_save);
    __ fld(f21, f21_save);
    __ fld(f20, f20_save);
    __ fld(f19, f19_save);
    __ fld(f18, f18_save);
    __ fld(f9,  f9_save);
    __ fld(f8,  f8_save);

    __ ld(x27, x27_save);
    __ ld(x26, x26_save);
    __ ld(x25, x25_save);
    __ ld(x24, x24_save);
    __ ld(x23, x23_save);
    __ ld(x22, x22_save);
    __ ld(x21, x21_save);
    __ ld(x20, x20_save);
    __ ld(x19, x19_save);
    __ ld(x18, x18_save);

    __ ld(x9, x9_save);

    __ ld(c_rarg0, call_wrapper);
    __ ld(c_rarg1, result);
    __ ld(c_rarg2, result_type);
    __ ld(c_rarg3, method);
    __ ld(c_rarg4, entry_point);
    __ ld(c_rarg5, parameters);
    __ ld(c_rarg6, parameter_size);
    __ ld(c_rarg7, thread);

    // leave frame and return to caller
    __ leave();
    __ ret();

    // handle return types different from T_INT

    __ BIND(is_long);
    __ sd(x10, Address(j_rarg2, 0));
    __ j(exit);

    __ BIND(is_float);
    __ fsw(j_farg0, Address(j_rarg2, 0), t0);
    __ j(exit);

    __ BIND(is_double);
    __ fsd(j_farg0, Address(j_rarg2, 0), t0);
    __ j(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // sp.
  //
  // x10: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address thread(fp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ld(t0, thread);
      __ bne(xthread, t0, S);
      __ get_thread(t0);
      __ beq(xthread, t0, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(x10);

    __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
    __ mv(t0, (address)__FILE__);
    __ sd(t0, Address(xthread, Thread::exception_file_offset()));
    __ mv(t0, (int)__LINE__);
    __ sw(t0, Address(xthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ j(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // x10: exception
  // x13: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in RA !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, RA points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
      __ bnez(t0, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into x9

    // call the VM to find the handler address associated with the
    // caller address. pass thread in x10 and caller pc (ret address)
    // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
    // the stack.
    __ mv(c_rarg1, ra);
    // ra will be trashed by the VM call so we move it to x9
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mv(x9, ra);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    xthread, c_rarg1);
    // we should not really care that ra is no longer the callee
    // address. we saved the value the handler needs in x9 so we can
    // just copy it to x13. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore ra here to satisfy that assert.
    __ mv(ra, x9);
    // setup x10 & x13 & clear pending exception
    __ mv(x13, x9);
    __ mv(x9, x10);
    __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
    __ sd(zr, Address(xthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ bnez(x10, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // x10: exception
    // x13: throwing pc
    // x9: exception handler
    __ verify_oop(x10);
    __ jr(x9);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    x10: oop to verify
  //    t0: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved ra
  //    [tos + 3]: saved t1
  //    [tos + 4]: saved x10
  //    [tos + 5]: saved t0
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ push_reg(0x3000, sp);   // save c_rarg2 and c_rarg3
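    // (0x3000 is a register bitmask: bits 12 and 13 select x12 and x13,
    // i.e. c_rarg2 and c_rarg3)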

    __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ld(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ sd(c_rarg3, Address(c_rarg2));

    // object is in x10
    // make sure object is 'reasonable'
    __ beqz(x10, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & x10 == 0
      __ ld(c_rarg3, Address(xthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, x10, c_rarg3);
      __ bnez(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mv(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, x10, c_rarg3);
    __ mv(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.
    __ bne(c_rarg2, c_rarg3, error);
    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(x10, x10);  // get klass
    __ beqz(x10, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ pop_reg(0x3000, sp);   // pop c_rarg2 and c_rarg3
    __ ret();

    // handle errors
    __ bind(error);
    __ pop_reg(0x3000, sp);   // pop c_rarg2 and c_rarg3

    __ push_reg(RegSet::range(x0, x31), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mv(c_rarg0, t0);             // pass address of error message
    __ mv(c_rarg1, ra);             // pass return address
    __ mv(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    int32_t offset = 0;
    __ movptr_with_offset(t0, CAST_FROM_FN_PTR(address, MacroAssembler::debug64), offset);
    __ jalr(x1, t0, offset);
    __ ebreak();

    return start;
  }

  // The inner part of zero_words().
  //
  // Inputs:
  // x28: the HeapWord-aligned base address of an array to zero.
  // x29: the count in HeapWords, x29 > 0.
  //
  // Returns x28 and x29, adjusted for the caller to clear.
  // x28: the base address of the tail of words left to clear.
  // x29: the number of words in the tail.
  //      x29 < MacroAssembler::zero_words_block_size.
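  //
  // A sketch of the assumed caller-side protocol (modeled on
  // MacroAssembler::zero_words; illustration only): with base in x28 and
  // count in x29 (count >= zero_words_block_size), call zero_blocks, then
  // clear the returned tail of x29 < zero_words_block_size words with
  // individual sd(zr, ...) stores.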

  address generate_zero_blocks() {
    Label done;

    const Register base = x28, cnt = x29;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    {
      // Clear the remaining blocks.
      Label loop;
      __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
      __ bltz(cnt, done);
      __ bind(loop);
      for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
        __ sd(zr, Address(base, 0));
        __ add(base, base, 8);
      }
      __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
      __ bgez(cnt, loop);
      __ bind(done);
      __ add(cnt, cnt, MacroAssembler::zero_words_block_size);
    }

    __ ret();

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
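  // A worked example of the contract (illustration only): for count == 23
  // the stub copies 16 words through the main loop and drain, 4 more in
  // the "count & 4" tail and 2 more in the "count & 2" tail, leaving bit 0
  // of count set so the caller copies the final word itself.
  //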
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = wordSize;

    const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
      tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;

    const Register stride = x30;

    assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
      tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
    assert_different_registers(s, d, count, t0);

    Label again, drain;
    const char* stub_name = NULL;
    if (direction == copy_forwards) {
      stub_name = "forward_copy_longs";
    } else {
      stub_name = "backward_copy_longs";
    }
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;

      __ li(t0, 8);
      __ bge(count, t0, L);
 758       __ stop("genrate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    __ ld(tmp_reg0, Address(s, 1 * unit));
    __ ld(tmp_reg1, Address(s, 2 * unit));
    __ ld(tmp_reg2, Address(s, 3 * unit));
    __ ld(tmp_reg3, Address(s, 4 * unit));
    __ ld(tmp_reg4, Address(s, 5 * unit));
    __ ld(tmp_reg5, Address(s, 6 * unit));
    __ ld(tmp_reg6, Address(s, 7 * unit));
    __ ld(tmp_reg7, Address(s, 8 * unit));
    __ addi(s, s, 8 * unit);

    __ sub(count, count, 16);
    __ bltz(count, drain);

    __ bind(again);

    __ sd(tmp_reg0, Address(d, 1 * unit));
    __ sd(tmp_reg1, Address(d, 2 * unit));
    __ sd(tmp_reg2, Address(d, 3 * unit));
    __ sd(tmp_reg3, Address(d, 4 * unit));
    __ sd(tmp_reg4, Address(d, 5 * unit));
    __ sd(tmp_reg5, Address(d, 6 * unit));
    __ sd(tmp_reg6, Address(d, 7 * unit));
    __ sd(tmp_reg7, Address(d, 8 * unit));

    __ ld(tmp_reg0, Address(s, 1 * unit));
    __ ld(tmp_reg1, Address(s, 2 * unit));
    __ ld(tmp_reg2, Address(s, 3 * unit));
    __ ld(tmp_reg3, Address(s, 4 * unit));
    __ ld(tmp_reg4, Address(s, 5 * unit));
    __ ld(tmp_reg5, Address(s, 6 * unit));
    __ ld(tmp_reg6, Address(s, 7 * unit));
    __ ld(tmp_reg7, Address(s, 8 * unit));

    __ addi(s, s, 8 * unit);
    __ addi(d, d, 8 * unit);

    __ sub(count, count, 8);
    __ bgez(count, again);

    // Drain
    __ bind(drain);

    __ sd(tmp_reg0, Address(d, 1 * unit));
    __ sd(tmp_reg1, Address(d, 2 * unit));
    __ sd(tmp_reg2, Address(d, 3 * unit));
    __ sd(tmp_reg3, Address(d, 4 * unit));
    __ sd(tmp_reg4, Address(d, 5 * unit));
    __ sd(tmp_reg5, Address(d, 6 * unit));
    __ sd(tmp_reg6, Address(d, 7 * unit));
    __ sd(tmp_reg7, Address(d, 8 * unit));
    __ addi(d, d, 8 * unit);

    {
      Label L1, L2;
      __ andi(t0, count, 4);
      __ beqz(t0, L1);

      __ ld(tmp_reg0, Address(s, 1 * unit));
      __ ld(tmp_reg1, Address(s, 2 * unit));
      __ ld(tmp_reg2, Address(s, 3 * unit));
      __ ld(tmp_reg3, Address(s, 4 * unit));
      __ addi(s, s, 4 * unit);

      __ sd(tmp_reg0, Address(d, 1 * unit));
      __ sd(tmp_reg1, Address(d, 2 * unit));
      __ sd(tmp_reg2, Address(d, 3 * unit));
      __ sd(tmp_reg3, Address(d, 4 * unit));
      __ addi(d, d, 4 * unit);

      __ bind(L1);

      if (direction == copy_forwards) {
        __ addi(s, s, bias);
        __ addi(d, d, bias);
      }

      __ andi(t0, count, 2);
      __ beqz(t0, L2);
      if (direction == copy_backwards) {
        __ addi(s, s, 2 * unit);
        __ ld(tmp_reg0, Address(s));
        __ ld(tmp_reg1, Address(s, wordSize));
        __ addi(d, d, 2 * unit);
        __ sd(tmp_reg0, Address(d));
        __ sd(tmp_reg1, Address(d, wordSize));
      } else {
        __ ld(tmp_reg0, Address(s));
        __ ld(tmp_reg1, Address(s, wordSize));
        __ addi(s, s, 2 * unit);
        __ sd(tmp_reg0, Address(d));
        __ sd(tmp_reg1, Address(d, wordSize));
        __ addi(d, d, 2 * unit);
      }
      __ bind(L2);
    }

    __ ret();
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //
  /*
   * if (is_aligned) {
   *   if (count >= 32)
   *     goto copy32_loop;
   *   if (count >= 8)
   *     goto copy8_loop;
   *   goto copy_small;
   * }
   * bool is_backwards = step < 0;
   * int granularity = uabs(step);
   * count = count * granularity;   // count is now a byte count
   *
   * if (is_backwards) {
   *   s += count;
   *   d += count;
   * }
   *
   * // the limit of 16 below could be raised for better performance
   * if (count < 16) {
   *   goto copy_small;
   * }
   *
   * if ((dst % 8) == (src % 8)) {
   *   align src and dst to an 8-byte boundary;
   *   goto copy_big;
   * }
   *
   * copy_big:
   *   if the amount to copy is more than (or equal to) 32 bytes, goto copy32_loop,
   *   else goto copy8_loop
   * copy_small:
   *   copy elements one by one;
   * done;
   */
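
  // A concrete illustration of the co-alignment test in the pseudocode
  // above: with src ending in 0x3 and dst ending in 0xb, (src ^ dst) & 7
  // == 0, so the element-by-element same_aligned loop can advance both
  // pointers to an 8-byte boundary and the bulk loops then use full-word
  // ld/sd. If the low three bits differ, the whole range goes through
  // copy_small.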

  typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);

  void copy_memory_v(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backward = step < 0;
    int granularity = uabs(step);

    const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
    assert_different_registers(s, d, cnt, vl, tmp, tmp1, tmp2);
    Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
    Label loop_forward, loop_backward, done;

    __ mv(dst, d);
    __ mv(src, s);
    __ mv(cnt, count);

    __ bind(loop_forward);
    __ vsetvli(vl, cnt, sew, Assembler::m8);
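    // vsetvli computes vl, the number of elements processed per pass
    // (at most min(cnt, VLMAX) for this SEW with m8 grouping)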
    if (is_backward) {
      __ bne(vl, cnt, loop_backward);
    }

    __ vlex_v(v0, src, sew);
    __ sub(cnt, cnt, vl);
    __ slli(vl, vl, (int)sew);
    __ add(src, src, vl);

    __ vsex_v(v0, dst, sew);
    __ add(dst, dst, vl);
    __ bnez(cnt, loop_forward);

    if (is_backward) {
      __ j(done);

      __ bind(loop_backward);
      __ sub(tmp, cnt, vl);
      __ slli(tmp, tmp, sew);
      __ add(tmp1, s, tmp);
      __ vlex_v(v0, tmp1, sew);
      __ add(tmp2, d, tmp);
      __ vsex_v(v0, tmp2, sew);
      __ sub(cnt, cnt, vl);
      __ bnez(cnt, loop_forward);
      __ bind(done);
    }
  }

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    if (UseRVV) {
      return copy_memory_v(s, d, count, tmp, step);
    }

    bool is_backwards = step < 0;
    int granularity = uabs(step);

    const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;

    Label same_aligned;
    Label copy_big, copy32_loop, copy8_loop, copy_small, done;

    copy_insn ld_arr = NULL, st_arr = NULL;
    switch (granularity) {
      case 1 :
        ld_arr = (copy_insn)&MacroAssembler::lbu;
        st_arr = (copy_insn)&MacroAssembler::sb;
        break;
      case 2 :
        ld_arr = (copy_insn)&MacroAssembler::lhu;
        st_arr = (copy_insn)&MacroAssembler::sh;
        break;
      case 4 :
        ld_arr = (copy_insn)&MacroAssembler::lwu;
        st_arr = (copy_insn)&MacroAssembler::sw;
        break;
      case 8 :
        ld_arr = (copy_insn)&MacroAssembler::ld;
        st_arr = (copy_insn)&MacroAssembler::sd;
        break;
      default :
        ShouldNotReachHere();
    }

    __ beqz(count, done);
    __ slli(cnt, count, exact_log2(granularity));
    if (is_backwards) {
      __ add(src, s, cnt);
      __ add(dst, d, cnt);
    } else {
      __ mv(src, s);
      __ mv(dst, d);
    }

    if (is_aligned) {
      __ addi(tmp, cnt, -32);
      __ bgez(tmp, copy32_loop);
      __ addi(tmp, cnt, -8);
      __ bgez(tmp, copy8_loop);
      __ j(copy_small);
    } else {
      __ mv(tmp, 16);
      __ blt(cnt, tmp, copy_small);

      __ xorr(tmp, src, dst);
      __ andi(tmp, tmp, 0b111);
      __ bnez(tmp, copy_small);

      __ bind(same_aligned);
      __ andi(tmp, src, 0b111);
      __ beqz(tmp, copy_big);
      if (is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      (_masm->*ld_arr)(tmp3, Address(src), t0);
      (_masm->*st_arr)(tmp3, Address(dst), t0);
      if (!is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      __ addi(cnt, cnt, -granularity);
      __ beqz(cnt, done);
      __ j(same_aligned);

      __ bind(copy_big);
      __ mv(tmp, 32);
      __ blt(cnt, tmp, copy8_loop);
    }
    __ bind(copy32_loop);
    if (is_backwards) {
      __ addi(src, src, -wordSize * 4);
      __ addi(dst, dst, -wordSize * 4);
    }
    // we first load 32 bytes, then store them, so the direction here doesn't matter
    __ ld(tmp3, Address(src));
    __ ld(tmp4, Address(src, 8));
    __ ld(tmp5, Address(src, 16));
    __ ld(tmp6, Address(src, 24));
    __ sd(tmp3, Address(dst));
    __ sd(tmp4, Address(dst, 8));
    __ sd(tmp5, Address(dst, 16));
    __ sd(tmp6, Address(dst, 24));

    if (!is_backwards) {
      __ addi(src, src, wordSize * 4);
      __ addi(dst, dst, wordSize * 4);
    }
    __ addi(tmp, cnt, -(32 + wordSize * 4));
    __ addi(cnt, cnt, -wordSize * 4);
    __ bgez(tmp, copy32_loop); // cnt >= 32, do next loop

    __ beqz(cnt, done); // if that's all - done

    __ addi(tmp, cnt, -8); // if not - copy the remainder
    __ bltz(tmp, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop

    __ bind(copy8_loop);
    if (is_backwards) {
      __ addi(src, src, -wordSize);
      __ addi(dst, dst, -wordSize);
    }
    __ ld(tmp3, Address(src));
    __ sd(tmp3, Address(dst));
    if (!is_backwards) {
      __ addi(src, src, wordSize);
      __ addi(dst, dst, wordSize);
    }
    __ addi(tmp, cnt, -(8 + wordSize));
    __ addi(cnt, cnt, -wordSize);
    __ bgez(tmp, copy8_loop); // cnt >= 8, do next loop

    __ beqz(cnt, done); // if that's all - done

    __ bind(copy_small);
    if (is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }
    (_masm->*ld_arr)(tmp3, Address(src), t0);
    (_masm->*st_arr)(tmp3, Address(dst), t0);
    if (!is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }
    __ addi(cnt, cnt, -granularity);
    __ bgtz(cnt, copy_small);

    __ bind(done);
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers t0 and t1.
  void verify_oop_array(size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mv(t1, zr);
    __ slli(t0, count, exact_log2(size));
    __ bind(loop);
    __ bgeu(t1, t0, end);

    __ add(temp, a, t1);
    if (size == (size_t)wordSize) {
      __ ld(temp, Address(temp, 0));
      __ verify_oop(temp);
    } else {
      __ lwu(temp, Address(temp, 0));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(t1, t1, size);
    __ j(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry,
                                 const char* name, bool dest_uninitialized = false) {
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, t0, size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());

    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address* entry, const char* name,
                                 bool dest_uninitialized = false) {
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(t0, d, s);
    __ slli(t1, count, exact_log2(size));
    __ bgeu(t0, t1, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, t0, -size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char* name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char* name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address* entry, const char* name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address* entry,
                                     const char* name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address* entry, const char* name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address* entry,
                                      const char* name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address* entry,
                                      const char* name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address* entry,
                                     const char* name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address* entry,
                                     const char* name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }

  // Helper for generating a dynamic type check.
  // Smashes t0, t1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    x10 ==  0  -  success
  //    x10 == -1^K - failure, where K is partial transfer count
  //
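  //  A worked example of the failure encoding (illustration only): if the
  //  type check fails after K == 3 elements have been copied, the stub
  //  returns x10 == -1 ^ 3 == ~3 == -4, and the caller recovers K as ~x10.
  //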
  address generate_checkcast_copy(const char* name, address* entry,
                                  bool dest_uninitialized = false) {
    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1521     const Register ckoff       = c_rarg3;   // super_check_offset
1522     const Register ckval       = c_rarg4;   // super_klass
1523 
1524     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1525     RegSet wb_post_saved_regs  = RegSet::of(count);
1526 
1527     // Registers used as temps (x7, x9, x18 are save-on-entry)
    const Register count_save  = x19;       // orig elements count
1529     const Register start_to    = x18;       // destination array start address
1530     const Register copied_oop  = x7;        // actual oop copied
1531     const Register r9_klass    = x9;        // oop._klass
1532 
1533     //---------------------------------------------------------------
1534     // Assembler stub will be used for this call to arraycopy
1535     // if the two arrays are subtypes of Object[] but the
1536     // destination array type is not equal to or a supertype
1537     // of the source type.  Each element must be separately
1538     // checked.
1539 
1540     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1541                                copied_oop, r9_klass, count_save);
1542 
1543     __ align(CodeEntryAlignment);
1544     StubCodeMark mark(this, "StubRoutines", name);
1545     address start = __ pc();
1546 
1547     __ enter(); // required for proper stackwalking of RuntimeStub frame
1548 
1549     // Caller of this entry point must set up the argument registers.
1550     if (entry != NULL) {
1551       *entry = __ pc();
1552       BLOCK_COMMENT("Entry:");
1553     }
1554 
1555     // Empty array:  Nothing to do
1556     __ beqz(count, L_done);
1557 
1558     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1559 
1560 #ifdef ASSERT
1561     BLOCK_COMMENT("assert consistent ckoff/ckval");
1562     // The ckoff and ckval must be mutually consistent,
1563     // even though caller generates both.
1564     { Label L;
1565       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1566       __ lwu(start_to, Address(ckval, sco_offset));
1567       __ beq(ckoff, start_to, L);
1568       __ stop("super_check_offset inconsistent");
1569       __ bind(L);
1570     }
1571 #endif //ASSERT
1572 
1573     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1574     bool is_oop = true;
1575     if (dest_uninitialized) {
1576       decorators |= IS_DEST_UNINITIALIZED;
1577     }
1578 
1579     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1580     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1581 
1582     // save the original count
1583     __ mv(count_save, count);
1584 
1585     // Copy from low to high addresses
1586     __ mv(start_to, to);              // Save destination array start address
1587     __ j(L_load_element);
1588 
1589     // ======== begin loop ========
1590     // (Loop is rotated; its entry is L_load_element.)
1591     // Loop control:
    //   for count down to 0 do
1593     //     copied_oop = load_heap_oop(from++)
1594     //     ... generate_type_check ...
1595     //     store_heap_oop(to++, copied_oop)
1596     //   end
1597 
1598     __ align(OptoLoopAlignment);
1599 
1600     __ BIND(L_store_element);
1601     __ store_heap_oop(Address(to, 0), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1602     __ add(to, to, UseCompressedOops ? 4 : 8);
1603     __ sub(count, count, 1);
1604     __ beqz(count, L_do_card_marks);
1605 
1606     // ======== loop entry is here ========
1607     __ BIND(L_load_element);
1608     __ load_heap_oop(copied_oop, Address(from, 0), noreg, noreg, AS_RAW); // load the oop
1609     __ add(from, from, UseCompressedOops ? 4 : 8);
1610     __ beqz(copied_oop, L_store_element);
1611 
    __ load_klass(r9_klass, copied_oop); // query the object klass
1613     generate_type_check(r9_klass, ckoff, ckval, L_store_element);
1614     // ======== end loop ========
1615 
1616     // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
1618     // Emit GC store barriers for the oops we have copied and report
1619     // their number to the caller.
1620 
1621     __ sub(count, count_save, count);     // K = partially copied oop count
    __ xori(count, count, -1);            // report (-1^K) to caller
1623     __ beqz(count, L_done_pop);
1624 
1625     __ BIND(L_do_card_marks);
1626     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);
1627 
1628     __ bind(L_done_pop);
1629     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1630     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1631 
1632     __ bind(L_done);
1633     __ mv(x10, count);
1634     __ leave();
1635     __ ret();
1636 
1637     return start;
1638   }
1639 
1640   // Perform range checks on the proposed arraycopy.
1641   // Kills temp, but nothing else.
1642   // Also, clean the sign bits of src_pos and dst_pos.
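  //
  // In C terms (an illustrative sketch; the unsigned compares also catch
  // sums that overflow into the sign bit):
  //
  //   if ((juint)(src_pos + length) > (juint)src->length()) goto L_failed;
  //   if ((juint)(dst_pos + length) > (juint)dst->length()) goto L_failed;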
1643   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1644                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1646                               Register dst_pos, // destination position (c_rarg3)
1647                               Register length,
1648                               Register temp,
1649                               Label& L_failed) {
1650     BLOCK_COMMENT("arraycopy_range_checks:");
1651 
1652     assert_different_registers(t0, temp);
1653 
1654     // if [src_pos + length > arrayOop(src)->length()] then FAIL
1655     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1656     __ addw(temp, length, src_pos);
1657     __ bgtu(temp, t0, L_failed);
1658 
1659     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1660     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1661     __ addw(temp, length, dst_pos);
1662     __ bgtu(temp, t0, L_failed);
1663 
1664     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1665     __ zero_extend(src_pos, src_pos, 32);
1666     __ zero_extend(dst_pos, dst_pos, 32);
1667 
1668     BLOCK_COMMENT("arraycopy_range_checks done");
1669   }
1670 
1671   //
1672   //  Generate 'unsafe' array copy stub
1673   //  Though just as safe as the other stubs, it takes an unscaled
1674   //  size_t argument instead of an element count.
1675   //
1676   //  Input:
1677   //    c_rarg0   - source array address
1678   //    c_rarg1   - destination array address
1679   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1680   //
1681   // Examines the alignment of the operands and dispatches
1682   // to a long, int, short, or byte copy loop.
1683   //
1684   address generate_unsafe_copy(const char* name,
1685                                address byte_copy_entry,
1686                                address short_copy_entry,
1687                                address int_copy_entry,
1688                                address long_copy_entry) {
1689     assert_cond(byte_copy_entry != NULL && short_copy_entry != NULL &&
1690                 int_copy_entry != NULL && long_copy_entry != NULL);
1691     Label L_long_aligned, L_int_aligned, L_short_aligned;
1692     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1693 
1694     __ align(CodeEntryAlignment);
1695     StubCodeMark mark(this, "StubRoutines", name);
1696     address start = __ pc();
1697     __ enter(); // required for proper stackwalking of RuntimeStub frame
1698 
1699     // bump this on entry, not on exit:
1700     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1701 
1702     __ orr(t0, s, d);
1703     __ orr(t0, t0, count);
1704 
1705     __ andi(t0, t0, BytesPerLong - 1);
1706     __ beqz(t0, L_long_aligned);
1707     __ andi(t0, t0, BytesPerInt - 1);
1708     __ beqz(t0, L_int_aligned);
1709     __ andi(t0, t0, 1);
1710     __ beqz(t0, L_short_aligned);
1711     __ j(RuntimeAddress(byte_copy_entry));
1712 
1713     __ BIND(L_short_aligned);
1714     __ srli(count, count, LogBytesPerShort);  // size => short_count
1715     __ j(RuntimeAddress(short_copy_entry));
1716     __ BIND(L_int_aligned);
1717     __ srli(count, count, LogBytesPerInt);    // size => int_count
1718     __ j(RuntimeAddress(int_copy_entry));
1719     __ BIND(L_long_aligned);
1720     __ srli(count, count, LogBytesPerLong);   // size => long_count
1721     __ j(RuntimeAddress(long_copy_entry));
1722 
1723     return start;
1724   }
1725 
1726   //
1727   //  Generate generic array copy stubs
1728   //
1729   //  Input:
1730   //    c_rarg0    -  src oop
1731   //    c_rarg1    -  src_pos (32-bits)
1732   //    c_rarg2    -  dst oop
1733   //    c_rarg3    -  dst_pos (32-bits)
1734   //    c_rarg4    -  element count (32-bits)
1735   //
1736   //  Output:
1737   //    x10 ==  0  -  success
1738   //    x10 == -1^K - failure, where K is partial transfer count
1739   //
1740   address generate_generic_copy(const char* name,
1741                                 address byte_copy_entry, address short_copy_entry,
1742                                 address int_copy_entry, address oop_copy_entry,
1743                                 address long_copy_entry, address checkcast_copy_entry) {
1744     assert_cond(byte_copy_entry != NULL && short_copy_entry != NULL &&
1745                 int_copy_entry != NULL && oop_copy_entry != NULL &&
1746                 long_copy_entry != NULL && checkcast_copy_entry != NULL);
1747     Label L_failed, L_failed_0, L_objArray;
1748     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1749 
1750     // Input registers
1751     const Register src        = c_rarg0;  // source array oop
1752     const Register src_pos    = c_rarg1;  // source position
1753     const Register dst        = c_rarg2;  // destination array oop
1754     const Register dst_pos    = c_rarg3;  // destination position
1755     const Register length     = c_rarg4;
1756 
1757     // Registers used as temps
1758     const Register dst_klass = c_rarg5;
1759 
1760     __ align(CodeEntryAlignment);
1761 
1762     StubCodeMark mark(this, "StubRoutines", name);
1763 
1764     address start = __ pc();
1765 
1766     __ enter(); // required for proper stackwalking of RuntimeStub frame
1767 
1768     // bump this on entry, not on exit:
1769     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1770 
1771     //-----------------------------------------------------------------------
1772     // Assembler stub will be used for this call to arraycopy
1773     // if the following conditions are met:
1774     //
1775     // (1) src and dst must not be null.
1776     // (2) src_pos must not be negative.
1777     // (3) dst_pos must not be negative.
1778     // (4) length  must not be negative.
1779     // (5) src klass and dst klass should be the same and not NULL.
1780     // (6) src and dst should be arrays.
1781     // (7) src_pos + length must not exceed length of src.
1782     // (8) dst_pos + length must not exceed length of dst.
1783     //
1784 
1785     // if [src == NULL] then return -1
1786     __ beqz(src, L_failed);
1787 
1788     // if [src_pos < 0] then return -1
1789     // i.e. sign bit set
1790     __ andi(t0, src_pos, 1UL << 31);
1791     __ bnez(t0, L_failed);
1792 
1793     // if [dst == NULL] then return -1
1794     __ beqz(dst, L_failed);
1795 
1796     // if [dst_pos < 0] then return -1
1797     // i.e. sign bit set
1798     __ andi(t0, dst_pos, 1UL << 31);
1799     __ bnez(t0, L_failed);
1800 
    // registers used as temps
1802     const Register scratch_length    = x28; // elements count to copy
1803     const Register scratch_src_klass = x29; // array klass
1804     const Register lh                = x30; // layout helper
1805 
1806     // if [length < 0] then return -1
    __ addw(scratch_length, length, zr);    // length (elements count, 32-bit value)
1808     // i.e. sign bit set
1809     __ andi(t0, scratch_length, 1UL << 31);
1810     __ bnez(t0, L_failed);
1811 
1812     __ load_klass(scratch_src_klass, src);
1813 #ifdef ASSERT
1814     {
1815       BLOCK_COMMENT("assert klasses not null {");
1816       Label L1, L2;
1817       __ bnez(scratch_src_klass, L2);   // it is broken if klass is NULL
1818       __ bind(L1);
1819       __ stop("broken null klass");
1820       __ bind(L2);
1821       __ load_klass(t0, dst, t1);
1822       __ beqz(t0, L1);     // this would be broken also
1823       BLOCK_COMMENT("} assert klasses not null done");
1824     }
1825 #endif
1826 
1827     // Load layout helper (32-bits)
1828     //
1829     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1830     // 32        30    24            16              8     2                 0
1831     //
1832     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1833     //
1834 
1835     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1836 
1837     // Handle objArrays completely differently...
1838     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1839     __ lw(lh, Address(scratch_src_klass, lh_offset));
1840     __ mvw(t0, objArray_lh);
1841     __ beq(lh, t0, L_objArray);
1842 
1843     // if [src->klass() != dst->klass()] then return -1
1844     __ load_klass(t1, dst);
1845     __ bne(t1, scratch_src_klass, L_failed);
1846 
    // if [!src->is_Array()] then return -1
    // i.e. layout helper is non-negative (lh >= 0)
1849     __ andi(t0, lh, 1UL << 31);
1850     __ beqz(t0, L_failed);
1851 
1852     // At this point, it is known to be a typeArray (array_tag 0x3).
1853 #ifdef ASSERT
1854     {
1855       BLOCK_COMMENT("assert primitive array {");
1856       Label L;
1857       __ mvw(t1, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1858       __ bge(lh, t1, L);
1859       __ stop("must be a primitive array");
1860       __ bind(L);
1861       BLOCK_COMMENT("} assert primitive array done");
1862     }
1863 #endif
1864 
1865     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1866                            t1, L_failed);
1867 
1868     // TypeArrayKlass
1869     //
1870     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1871     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1872     //
1873 
1874     const Register t0_offset = t0;    // array offset
1875     const Register x22_elsize = lh;   // element size
1876 
1877     // Get array_header_in_bytes()
1878     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1879     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
    __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // left shift to clear the bits above the header_size field
1881     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1882 
1883     __ add(src, src, t0_offset);           // src array offset
1884     __ add(dst, dst, t0_offset);           // dst array offset
1885     BLOCK_COMMENT("choose copy loop based on element size");
1886 
1887     // next registers should be set before the jump to corresponding stub
1888     const Register from     = c_rarg0;  // source array address
1889     const Register to       = c_rarg1;  // destination array address
1890     const Register count    = c_rarg2;  // elements count
1891 
    // 'from', 'to' and 'count' must be assigned in this order, since they
    // alias 'src', 'src_pos' and 'dst' (c_rarg0..c_rarg2); assigning them out
    // of order would clobber inputs that are still needed.
1894 
1895     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1896 
1897     // The possible values of elsize are 0-3, i.e. exact_log2(element
1898     // size in bytes).  We do a simple bitwise binary search.
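    // In C terms (an illustrative sketch):
    //   if (elsize & 2) { (elsize & 1) ? long_copy()  : int_copy();  }
    //   else            { (elsize & 1) ? short_copy() : byte_copy(); }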
1899   __ BIND(L_copy_bytes);
1900     __ andi(t0, x22_elsize, 2);
1901     __ bnez(t0, L_copy_ints);
1902     __ andi(t0, x22_elsize, 1);
1903     __ bnez(t0, L_copy_shorts);
1904     __ add(from, src, src_pos); // src_addr
1905     __ add(to, dst, dst_pos); // dst_addr
1906     __ addw(count, scratch_length, zr); // length
1907     __ j(RuntimeAddress(byte_copy_entry));
1908 
1909   __ BIND(L_copy_shorts);
1910     __ shadd(from, src_pos, src, t0, 1); // src_addr
1911     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1912     __ addw(count, scratch_length, zr); // length
1913     __ j(RuntimeAddress(short_copy_entry));
1914 
1915   __ BIND(L_copy_ints);
1916     __ andi(t0, x22_elsize, 1);
1917     __ bnez(t0, L_copy_longs);
1918     __ shadd(from, src_pos, src, t0, 2); // src_addr
1919     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
1920     __ addw(count, scratch_length, zr); // length
1921     __ j(RuntimeAddress(int_copy_entry));
1922 
1923   __ BIND(L_copy_longs);
1924 #ifdef ASSERT
1925     {
1926       BLOCK_COMMENT("assert long copy {");
1927       Label L;
1928       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x22_elsize
1929       __ addw(lh, lh, zr);
1930       __ mvw(t0, LogBytesPerLong);
1931       __ beq(x22_elsize, t0, L);
1932       __ stop("must be long copy, but elsize is wrong");
1933       __ bind(L);
1934       BLOCK_COMMENT("} assert long copy done");
1935     }
1936 #endif
1937     __ shadd(from, src_pos, src, t0, 3); // src_addr
1938     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
1939     __ addw(count, scratch_length, zr); // length
1940     __ j(RuntimeAddress(long_copy_entry));
1941 
1942     // ObjArrayKlass
1943   __ BIND(L_objArray);
1944     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1945 
1946     Label L_plain_copy, L_checkcast_copy;
1947     // test array classes for subtyping
1948     __ load_klass(t2, dst);
1949     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
1950 
1951     // Identically typed arrays can be copied without element-wise checks.
1952     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1953                            t1, L_failed);
1954 
1955     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1956     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1957     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1958     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1959     __ addw(count, scratch_length, zr); // length
1960   __ BIND(L_plain_copy);
1961     __ j(RuntimeAddress(oop_copy_entry));
1962 
1963   __ BIND(L_checkcast_copy);
1964     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
1965     {
1966       // Before looking at dst.length, make sure dst is also an objArray.
1967       __ lwu(t0, Address(t2, lh_offset));
1968       __ mvw(t1, objArray_lh);
1969       __ bne(t0, t1, L_failed);
1970 
1971       // It is safe to examine both src.length and dst.length.
1972       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1973                              t2, L_failed);
1974 
1975       __ load_klass(dst_klass, dst); // reload
1976 
1977       // Marshal the base address arguments now, freeing registers.
1978       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1979       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1980       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1981       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1982       __ addw(count, length, zr);           // length (reloaded)
1983       const Register sco_temp = c_rarg3;      // this register is free now
1984       assert_different_registers(from, to, count, sco_temp,
1985                                  dst_klass, scratch_src_klass);
1986 
1987       // Generate the type check.
1988       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1989       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1990 
1991       // Smashes t0, t1
1992       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
1993 
1994       // Fetch destination element klass from the ObjArrayKlass header.
1995       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1996       __ ld(dst_klass, Address(dst_klass, ek_offset));
1997       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1998 
1999       // the checkcast_copy loop needs two extra arguments:
2000       assert(c_rarg3 == sco_temp, "#3 already in place");
2001       // Set up arguments for checkcast_copy_entry.
2002       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
2003       __ j(RuntimeAddress(checkcast_copy_entry));
2004     }
2005 
2006   __ BIND(L_failed);
2007     __ li(x10, -1);
2008     __ leave();   // required for proper stackwalking of RuntimeStub frame
2009     __ ret();
2010 
2011     return start;
2012   }
2013 
2014   //
2015   // Generate stub for array fill. If "aligned" is true, the
2016   // "to" address is assumed to be heapword aligned.
2017   //
2018   // Arguments for generated stub:
2019   //   to:    c_rarg0
2020   //   value: c_rarg1
2021   //   count: c_rarg2 treated as signed
2022   //
2023   address generate_fill(BasicType t, bool aligned, const char* name) {
2024     __ align(CodeEntryAlignment);
2025     StubCodeMark mark(this, "StubRoutines", name);
2026     address start = __ pc();
2027 
2028     BLOCK_COMMENT("Entry:");
2029 
    const Register to        = c_rarg0;  // destination array address
2031     const Register value     = c_rarg1;  // value
2032     const Register count     = c_rarg2;  // elements count
2033 
2034     const Register bz_base   = x28;      // base for block_zero routine
2035     const Register cnt_words = x29;      // temp register
2036     const Register tmp_reg   = t1;
2037 
2038     __ enter();
2039 
2040     Label L_fill_elements, L_exit1;
2041 
2042     int shift = -1;
2043     switch (t) {
2044       case T_BYTE:
2045         shift = 0;
2046 
2047         // Zero extend value
2048         // 8 bit -> 16 bit
2049         __ andi(value, value, 0xff);
2050         __ mv(tmp_reg, value);
2051         __ slli(tmp_reg, tmp_reg, 8);
2052         __ orr(value, value, tmp_reg);
2053 
2054         // 16 bit -> 32 bit
2055         __ mv(tmp_reg, value);
2056         __ slli(tmp_reg, tmp_reg, 16);
2057         __ orr(value, value, tmp_reg);
2058 
2059         __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2060         __ bltu(count, tmp_reg, L_fill_elements);
2061         break;
2062       case T_SHORT:
2063         shift = 1;
2064         // Zero extend value
2065         // 16 bit -> 32 bit
2066         __ andi(value, value, 0xffff);
2067         __ mv(tmp_reg, value);
2068         __ slli(tmp_reg, tmp_reg, 16);
2069         __ orr(value, value, tmp_reg);
2070 
2071         // Short arrays (< 8 bytes) fill by element
2072         __ mv(tmp_reg, 8 >> shift);
2073         __ bltu(count, tmp_reg, L_fill_elements);
2074         break;
2075       case T_INT:
2076         shift = 2;
2077 
2078         // Short arrays (< 8 bytes) fill by element
2079         __ mv(tmp_reg, 8 >> shift);
2080         __ bltu(count, tmp_reg, L_fill_elements);
2081         break;
2082       default: ShouldNotReachHere();
2083     }
2084 
    // Align the destination address to an 8-byte boundary.
2086     Label L_skip_align1, L_skip_align2, L_skip_align4;
2087     if (!aligned) {
2088       switch (t) {
2089         case T_BYTE:
2090           // One byte misalignment happens only for byte arrays.
2091           __ andi(t0, to, 1);
2092           __ beqz(t0, L_skip_align1);
2093           __ sb(value, Address(to, 0));
2094           __ addi(to, to, 1);
2095           __ addiw(count, count, -1);
2096           __ bind(L_skip_align1);
2097           // Fallthrough
2098         case T_SHORT:
2099           // Two bytes misalignment happens only for byte and short (char) arrays.
2100           __ andi(t0, to, 2);
2101           __ beqz(t0, L_skip_align2);
2102           __ sh(value, Address(to, 0));
2103           __ addi(to, to, 2);
2104           __ addiw(count, count, -(2 >> shift));
2105           __ bind(L_skip_align2);
2106           // Fallthrough
2107         case T_INT:
2108           // Align to 8 bytes, we know we are 4 byte aligned to start.
2109           __ andi(t0, to, 4);
2110           __ beqz(t0, L_skip_align4);
2111           __ sw(value, Address(to, 0));
2112           __ addi(to, to, 4);
2113           __ addiw(count, count, -(4 >> shift));
2114           __ bind(L_skip_align4);
2115           break;
2116         default: ShouldNotReachHere();
2117       }
2118     }
2119 
2120     //
2121     //  Fill large chunks
2122     //
2123     __ srliw(cnt_words, count, 3 - shift); // number of words
2124 
2125     // 32 bit -> 64 bit
2126     __ andi(value, value, 0xffffffff);
2127     __ mv(tmp_reg, value);
2128     __ slli(tmp_reg, tmp_reg, 32);
2129     __ orr(value, value, tmp_reg);
2130 
2131     __ slli(tmp_reg, cnt_words, 3 - shift);
2132     __ subw(count, count, tmp_reg);
2133     {
2134       __ fill_words(to, cnt_words, value);
2135     }
2136 
2137     // Remaining count is less than 8 bytes. Fill it by a single store.
2138     // Note that the total length is no less than 8 bytes.
2139     if (t == T_BYTE || t == T_SHORT) {
2140       __ beqz(count, L_exit1);
2141       __ shadd(to, count, to, tmp_reg, shift); // points to the end
2142       __ sd(value, Address(to, -8)); // overwrite some elements
2143       __ bind(L_exit1);
2144       __ leave();
2145       __ ret();
2146     }
2147 
    // Handle fills of less than 8 bytes.
2149     Label L_fill_2, L_fill_4, L_exit2;
2150     __ bind(L_fill_elements);
2151     switch (t) {
2152       case T_BYTE:
2153         __ andi(t0, count, 1);
2154         __ beqz(t0, L_fill_2);
2155         __ sb(value, Address(to, 0));
2156         __ addi(to, to, 1);
2157         __ bind(L_fill_2);
2158         __ andi(t0, count, 2);
2159         __ beqz(t0, L_fill_4);
2160         __ sh(value, Address(to, 0));
2161         __ addi(to, to, 2);
2162         __ bind(L_fill_4);
2163         __ andi(t0, count, 4);
2164         __ beqz(t0, L_exit2);
2165         __ sw(value, Address(to, 0));
2166         break;
2167       case T_SHORT:
2168         __ andi(t0, count, 1);
2169         __ beqz(t0, L_fill_4);
2170         __ sh(value, Address(to, 0));
2171         __ addi(to, to, 2);
2172         __ bind(L_fill_4);
2173         __ andi(t0, count, 2);
2174         __ beqz(t0, L_exit2);
2175         __ sw(value, Address(to, 0));
2176         break;
2177       case T_INT:
2178         __ beqz(count, L_exit2);
2179         __ sw(value, Address(to, 0));
2180         break;
2181       default: ShouldNotReachHere();
2182     }
2183     __ bind(L_exit2);
2184     __ leave();
2185     __ ret();
2186     return start;
2187   }
2188 
2189   void generate_arraycopy_stubs() {
2190     address entry                     = NULL;
2191     address entry_jbyte_arraycopy     = NULL;
2192     address entry_jshort_arraycopy    = NULL;
2193     address entry_jint_arraycopy      = NULL;
2194     address entry_oop_arraycopy       = NULL;
2195     address entry_jlong_arraycopy     = NULL;
2196     address entry_checkcast_arraycopy = NULL;
2197 
2198     generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards);
2199     generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards);
2200 
2201     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2202 
2203     //*** jbyte
2204     // Always need aligned and unaligned versions
2205     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_byte_copy(false, &entry,
2206                                                                                    "jbyte_disjoint_arraycopy");
2207     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_byte_copy(false, entry,
2208                                                                                    &entry_jbyte_arraycopy,
2209                                                                                    "jbyte_arraycopy");
2210     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, &entry,
2211                                                                                    "arrayof_jbyte_disjoint_arraycopy");
2212     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_byte_copy(true, entry, NULL,
2213                                                                                    "arrayof_jbyte_arraycopy");
2214 
2215     //*** jshort
2216     // Always need aligned and unaligned versions
2217     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2218                                                                                     "jshort_disjoint_arraycopy");
2219     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2220                                                                                     &entry_jshort_arraycopy,
2221                                                                                     "jshort_arraycopy");
2222     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2223                                                                                     "arrayof_jshort_disjoint_arraycopy");
2224     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2225                                                                                     "arrayof_jshort_arraycopy");
2226 
2227     //*** jint
2228     // Aligned versions
2229     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_copy(true, &entry,
2230                                                                                   "arrayof_jint_disjoint_arraycopy");
2231     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2232                                                                                   "arrayof_jint_arraycopy");
2233     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2234     // entry_jint_arraycopy always points to the unaligned version
2235     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_int_copy(false, &entry,
2236                                                                                   "jint_disjoint_arraycopy");
2237     StubRoutines::_jint_arraycopy                    = generate_conjoint_int_copy(false, entry,
2238                                                                                   &entry_jint_arraycopy,
2239                                                                                   "jint_arraycopy");
2240 
2241     //*** jlong
2242     // It is always aligned
2243     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, &entry,
2244                                                                                    "arrayof_jlong_disjoint_arraycopy");
2245     StubRoutines::_arrayof_jlong_arraycopy           = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2246                                                                                    "arrayof_jlong_arraycopy");
2247     StubRoutines::_jlong_disjoint_arraycopy          = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2248     StubRoutines::_jlong_arraycopy                   = StubRoutines::_arrayof_jlong_arraycopy;
2249 
2250     //*** oops
2251     {
2252       // With compressed oops we need unaligned versions; notice that
2253       // we overwrite entry_oop_arraycopy.
2254       bool aligned = !UseCompressedOops;
2255 
2256       StubRoutines::_arrayof_oop_disjoint_arraycopy
2257         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2258                                      /*dest_uninitialized*/false);
2259       StubRoutines::_arrayof_oop_arraycopy
2260         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2261                                      /*dest_uninitialized*/false);
2262       // Aligned versions without pre-barriers
2263       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2264         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2265                                      /*dest_uninitialized*/true);
2266       StubRoutines::_arrayof_oop_arraycopy_uninit
2267         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2268                                      /*dest_uninitialized*/true);
2269     }
2270 
2271     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2272     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2273     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2274     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2275 
2276     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2277     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2278                                                                         /*dest_uninitialized*/true);
2279 
2280 
2281     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2282                                                               entry_jbyte_arraycopy,
2283                                                               entry_jshort_arraycopy,
2284                                                               entry_jint_arraycopy,
2285                                                               entry_jlong_arraycopy);
2286 
2287     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2288                                                                entry_jbyte_arraycopy,
2289                                                                entry_jshort_arraycopy,
2290                                                                entry_jint_arraycopy,
2291                                                                entry_oop_arraycopy,
2292                                                                entry_jlong_arraycopy,
2293                                                                entry_checkcast_arraycopy);
2294 
2295     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2296     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2297     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2298     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2299     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2300     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2301   }
2302 
2303   // Safefetch stubs.
2304   void generate_safefetch(const char* name, int size, address* entry,
2305                           address* fault_pc, address* continuation_pc) {
2306     // safefetch signatures:
2307     //   int      SafeFetch32(int*      adr, int      errValue)
2308     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue)
2309     //
2310     // arguments:
2311     //   c_rarg0 = adr
2312     //   c_rarg1 = errValue
2313     //
2314     // result:
    //   x10 = *adr or errValue
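    //
    // Typical use (an illustrative sketch): probing a possibly-unmapped
    // address without crashing the VM:
    //   int v = SafeFetch32(adr, 0xBADBAD);
    //   if (v == 0xBADBAD) { /* load faulted, or really read 0xBADBAD */ }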
2316     assert_cond(entry != NULL && fault_pc != NULL && continuation_pc != NULL);
2317     StubCodeMark mark(this, "StubRoutines", name);
2318 
2319     // Entry point, pc or function descriptor.
2320     *entry = __ pc();
2321 
2322     // Load *adr into c_rarg1, may fault.
2323     *fault_pc = __ pc();
2324     switch (size) {
2325       case 4:
2326         // int32_t
2327         __ lw(c_rarg1, Address(c_rarg0, 0));
2328         break;
2329       case 8:
2330         // int64_t
2331         __ ld(c_rarg1, Address(c_rarg0, 0));
2332         break;
2333       default:
2334         ShouldNotReachHere();
2335     }
2336 
2337     // return errValue or *adr
2338     *continuation_pc = __ pc();
2339     __ mv(x10, c_rarg1);
2340     __ ret();
2341   }
2342 
2343   // code for comparing 16 bytes of strings with same encoding
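  // On entry, tmp1/tmp2 must already hold the current 8 bytes of each string.
  // The loads are software-pipelined: while tmp1/tmp2 are being compared, the
  // next 8 bytes are fetched into tmp5/cnt1, and vice versa, so a load is
  // always in flight.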
2344   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
2345     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31;
2346     __ ld(tmp5, Address(str1));
2347     __ addi(str1, str1, 8);
2348     __ xorr(tmp4, tmp1, tmp2);
2349     __ ld(cnt1, Address(str2));
2350     __ addi(str2, str2, 8);
2351     __ bnez(tmp4, DIFF1);
2352     __ ld(tmp1, Address(str1));
2353     __ addi(str1, str1, 8);
2354     __ xorr(tmp4, tmp5, cnt1);
2355     __ ld(tmp2, Address(str2));
2356     __ addi(str2, str2, 8);
2357     __ bnez(tmp4, DIFF2);
2358   }
2359 
  // code for comparing 8 characters of strings with Latin1 and UTF-16 encoding
2361   void compare_string_8_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
2362                               Label &DIFF2) {
2363     const Register strU = x12, curU = x7, strL = x29, tmp = x30;
2364     __ ld(tmpL, Address(strL));
2365     __ addi(strL, strL, 8);
2366     __ ld(tmpU, Address(strU));
2367     __ addi(strU, strU, 8);
2368     __ inflate_lo32(tmp, tmpL);
2369     __ mv(t0, tmp);
2370     __ xorr(tmp, curU, t0);
2371     __ bnez(tmp, DIFF2);
2372 
2373     __ ld(curU, Address(strU));
2374     __ addi(strU, strU, 8);
2375     __ inflate_hi32(tmp, tmpL);
2376     __ mv(t0, tmp);
2377     __ xorr(tmp, tmpU, t0);
2378     __ bnez(tmp, DIFF1);
2379   }
2380 
2381   // x10  = result
2382   // x11  = str1
2383   // x12  = cnt1
2384   // x13  = str2
2385   // x14  = cnt2
2386   // x28  = tmp1
2387   // x29  = tmp2
2388   // x30  = tmp3
2389   address generate_compare_long_string_different_encoding(bool isLU) {
2390     __ align(CodeEntryAlignment);
2391     StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL");
2392     address entry = __ pc();
2393     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
2394           DONE, CALCULATE_DIFFERENCE;
2395     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2396                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2397     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
2398 
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which have already been loaded
2401     __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2);
2402     __ mv(isLU ? tmp1 : tmp2, tmp3);
2403     __ addi(str1, str1, isLU ? wordSize / 2 : wordSize);
2404     __ addi(str2, str2, isLU ? wordSize : wordSize / 2);
2405     __ sub(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
2406     __ push_reg(spilled_regs, sp);
2407 
2408     if (isLU) {
2409       __ add(str1, str1, cnt2);
2410       __ shadd(str2, cnt2, str2, t0, 1);
2411     } else {
2412       __ shadd(str1, cnt2, str1, t0, 1);
2413       __ add(str2, str2, cnt2);
2414     }
2415     __ xorr(tmp3, tmp1, tmp2);
2416     __ mv(tmp5, tmp2);
2417     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2418 
2419     Register strU = isLU ? str2 : str1,
2420              strL = isLU ? str1 : str2,
2421              tmpU = isLU ? tmp5 : tmp1, // where to keep U for comparison
2422              tmpL = isLU ? tmp1 : tmp5; // where to keep L for comparison
2423 
2424     __ sub(tmp2, strL, cnt2); // strL pointer to load from
2425     __ slli(t0, cnt2, 1);
2426     __ sub(cnt1, strU, t0); // strU pointer to load from
2427 
2428     __ ld(tmp4, Address(cnt1));
2429     __ addi(cnt1, cnt1, 8);
2430     __ beqz(cnt2, LOAD_LAST); // no characters left except last load
2431     __ sub(cnt2, cnt2, 16);
2432     __ bltz(cnt2, TAIL);
2433     __ bind(SMALL_LOOP); // smaller loop
2434       __ sub(cnt2, cnt2, 16);
2435       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2436       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2437       __ bgez(cnt2, SMALL_LOOP);
2438       __ addi(t0, cnt2, 16);
2439       __ beqz(t0, LOAD_LAST);
2440     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
2441       // Address of 8 bytes before last 4 characters in UTF-16 string
2442       __ shadd(cnt1, cnt2, cnt1, t0, 1);
2443       // Address of 16 bytes before last 4 characters in Latin1 string
2444       __ add(tmp2, tmp2, cnt2);
2445       __ ld(tmp4, Address(cnt1, -8));
2446       // last 16 characters before last load
2447       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2448       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2449       __ j(LOAD_LAST);
2450     __ bind(DIFF2);
2451       __ mv(tmpU, tmp4);
2452     __ bind(DIFF1);
2453       __ mv(tmpL, t0);
2454       __ j(CALCULATE_DIFFERENCE);
2455     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters have already been pre-loaded into tmp4
      // by compare_string_8_x_LU, so there is no need to load them again.
2458       __ mv(tmpU, tmp4);
2459       __ ld(tmpL, Address(strL));
2460       __ inflate_lo32(tmp3, tmpL);
2461       __ mv(tmpL, tmp3);
2462       __ xorr(tmp3, tmpU, tmpL);
2463       __ beqz(tmp3, DONE);
2464 
2465       // Find the first different characters in the longwords and
2466       // compute their difference.
2467     __ bind(CALCULATE_DIFFERENCE);
2468       __ ctzc_bit(tmp4, tmp3);
2469       __ srl(tmp1, tmp1, tmp4);
2470       __ srl(tmp5, tmp5, tmp4);
2471       __ andi(tmp1, tmp1, 0xFFFF);
2472       __ andi(tmp5, tmp5, 0xFFFF);
2473       __ sub(result, tmp1, tmp5);
2474     __ bind(DONE);
2475       __ pop_reg(spilled_regs, sp);
2476       __ ret();
2477     return entry;
2478   }
2479 
2480   address generate_method_entry_barrier() {
2481     __ align(CodeEntryAlignment);
2482     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
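    // When the nmethod entry barrier is armed, this stub calls into the
    // runtime (BarrierSetNMethod::nmethod_stub_entry_barrier); if that call
    // returns non-zero, the frame is unwound and control tail-jumps to the
    // handler address restored from the stack (a summary of the code below).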
2483 
2484     Label deoptimize_label;
2485 
2486     address start = __ pc();
2487 
2488     __ set_last_Java_frame(sp, fp, ra, t0);
2489 
2490     __ enter();
2491     __ add(t1, sp, wordSize);
2492 
2493     __ sub(sp, sp, 4 * wordSize);
2494 
2495     __ push_call_clobbered_registers();
2496 
2497     __ mv(c_rarg0, t1);
2498     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
2499 
2500     __ reset_last_Java_frame(true);
2501 
2502     __ mv(t0, x10);
2503 
2504     __ pop_call_clobbered_registers();
2505 
2506     __ bnez(t0, deoptimize_label);
2507 
2508     __ leave();
2509     __ ret();
2510 
2511     __ BIND(deoptimize_label);
2512 
2513     __ ld(t0, Address(sp, 0));
2514     __ ld(fp, Address(sp, wordSize));
2515     __ ld(ra, Address(sp, wordSize * 2));
2516     __ ld(t1, Address(sp, wordSize * 3));
2517 
2518     __ mv(sp, t0);
2519     __ jr(t1);
2520 
2521     return start;
2522   }
2523 
2524   // x10  = result
2525   // x11  = str1
2526   // x12  = cnt1
2527   // x13  = str2
2528   // x14  = cnt2
2529   // x28  = tmp1
2530   // x29  = tmp2
2531   // x30  = tmp3
2532   // x31  = tmp4
2533   address generate_compare_long_string_same_encoding(bool isLL) {
2534     __ align(CodeEntryAlignment);
2535     StubCodeMark mark(this, "StubRoutines", isLL ?
2536                       "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU");
2537     address entry = __ pc();
2538     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
2539           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
2540     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2541                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2542     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
2543 
    // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
    // Update the cnt2 counter to account for the 8 bytes already loaded.
2546     __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
2547     // update pointers, because of previous read
2548     __ add(str1, str1, wordSize);
2549     __ add(str2, str2, wordSize);
2550     // less than 16 bytes left?
2551     __ sub(cnt2, cnt2, isLL ? 16 : 8);
2552     __ push_reg(spilled_regs, sp);
2553     __ bltz(cnt2, TAIL);
2554     __ bind(SMALL_LOOP);
2555       compare_string_16_bytes_same(DIFF, DIFF2);
2556       __ sub(cnt2, cnt2, isLL ? 16 : 8);
2557       __ bgez(cnt2, SMALL_LOOP);
2558     __ bind(TAIL);
2559       __ addi(cnt2, cnt2, isLL ? 16 : 8);
2560       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
2561       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2562       __ blez(cnt2, CHECK_LAST);
2563       __ xorr(tmp4, tmp1, tmp2);
2564       __ bnez(tmp4, DIFF);
2565       __ ld(tmp1, Address(str1));
2566       __ addi(str1, str1, 8);
2567       __ ld(tmp2, Address(str2));
2568       __ addi(str2, str2, 8);
2569       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2570     __ bind(CHECK_LAST);
2571       if (!isLL) {
2572         __ add(cnt2, cnt2, cnt2); // now in bytes
2573       }
2574       __ xorr(tmp4, tmp1, tmp2);
2575       __ bnez(tmp4, DIFF);
2576       __ add(str1, str1, cnt2);
2577       __ ld(tmp5, Address(str1));
2578       __ add(str2, str2, cnt2);
2579       __ ld(cnt1, Address(str2));
2580       __ xorr(tmp4, tmp5, cnt1);
2581       __ beqz(tmp4, LENGTH_DIFF);
2582       // Find the first different characters in the longwords and
2583       // compute their difference.
2584     __ bind(DIFF2);
      __ ctzc_bit(tmp3, tmp4, isLL); // count zeroes from lsb to msb
2586       __ srl(tmp5, tmp5, tmp3);
2587       __ srl(cnt1, cnt1, tmp3);
2588       if (isLL) {
2589         __ andi(tmp5, tmp5, 0xFF);
2590         __ andi(cnt1, cnt1, 0xFF);
2591       } else {
2592         __ andi(tmp5, tmp5, 0xFFFF);
2593         __ andi(cnt1, cnt1, 0xFFFF);
2594       }
2595       __ sub(result, tmp5, cnt1);
2596       __ j(LENGTH_DIFF);
2597     __ bind(DIFF);
      __ ctzc_bit(tmp3, tmp4, isLL); // count zeroes from lsb to msb
2599       __ srl(tmp1, tmp1, tmp3);
2600       __ srl(tmp2, tmp2, tmp3);
2601       if (isLL) {
2602         __ andi(tmp1, tmp1, 0xFF);
2603         __ andi(tmp2, tmp2, 0xFF);
2604       } else {
2605         __ andi(tmp1, tmp1, 0xFFFF);
2606         __ andi(tmp2, tmp2, 0xFFFF);
2607       }
2608       __ sub(result, tmp1, tmp2);
2609       __ j(LENGTH_DIFF);
2610     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
2611       __ xorr(tmp4, tmp1, tmp2);
2612       __ bnez(tmp4, DIFF);
2613     __ bind(LENGTH_DIFF);
2614       __ pop_reg(spilled_regs, sp);
2615       __ ret();
2616     return entry;
2617   }
2618 
2619   void generate_compare_long_strings() {
2620     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true);
2621     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false);
2622     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true);
2623     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false);
2624   }
2625 
2626   // x10 result
2627   // x11 src
2628   // x12 src count
2629   // x13 pattern
2630   // x14 pattern count
2631   address generate_string_indexof_linear(bool needle_isL, bool haystack_isL)
2632   {
2633     const char* stubName = needle_isL
2634            ? (haystack_isL ? "indexof_linear_ll" : "indexof_linear_ul")
2635            : "indexof_linear_uu";
2636     __ align(CodeEntryAlignment);
2637     StubCodeMark mark(this, "StubRoutines", stubName);
2638     address entry = __ pc();
2639 
2640     int needle_chr_size = needle_isL ? 1 : 2;
2641     int haystack_chr_size = haystack_isL ? 1 : 2;
2642     int needle_chr_shift = needle_isL ? 0 : 1;
2643     int haystack_chr_shift = haystack_isL ? 0 : 1;
2644     bool isL = needle_isL && haystack_isL;
2645     // parameters
2646     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
2647     // temporary registers
2648     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
2649     // redefinitions
2650     Register ch1 = x28, ch2 = x29;
2651     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
2652 
2653     __ push_reg(spilled_regs, sp);
2654 
2655     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
2656           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
2657           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
2658           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
2659           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
2660           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
2661 
2662     __ ld(ch1, Address(needle));
2663     __ ld(ch2, Address(haystack));
2664     // src.length - pattern.length
2665     __ sub(haystack_len, haystack_len, needle_len);
2666 
2667     // first is needle[0]
2668     __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first);
2669     uint64_t mask0101 = UCONST64(0x0101010101010101);
2670     uint64_t mask0001 = UCONST64(0x0001000100010001);
2671     __ mv(mask1, haystack_isL ? mask0101 : mask0001);
2672     __ mul(first, first, mask1);
2673     uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
2674     uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
2675     __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
2676     if (needle_isL != haystack_isL) {
2677       __ mv(tmp, ch1);
2678     }
2679     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
2680     __ blez(haystack_len, L_SMALL);
2681 
2682     if (needle_isL != haystack_isL) {
2683       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2684     }
2685     // xorr, sub, orr, notr, andr
2686     // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
2687     // eg:
2688     // first:        aa aa aa aa aa aa aa aa
2689     // ch2:          aa aa li nx jd ka aa aa
2690     // match_mask:   80 80 00 00 00 00 80 80
2691     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
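    // This is the classic SWAR zero-lane test applied to x = ch2 ^ first
    // (a lane of x is zero exactly where ch2 matches first):
    //   match_mask = (x - mask1) & ~(x | mask2)
    // Lanes above the first match may be false positives due to borrow
    // propagation, but the lowest set bit always marks the first real match,
    // which is all the ctzc_bit-based code below relies on.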
2692 
    // search for the first char of the needle; if found, goto L_HAS_ZERO
2694     __ bnez(match_mask, L_HAS_ZERO);
2695     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2696     __ add(result, result, wordSize / haystack_chr_size);
2697     __ add(haystack, haystack, wordSize);
2698     __ bltz(haystack_len, L_POST_LOOP);
2699 
2700     __ bind(L_LOOP);
2701     __ ld(ch2, Address(haystack));
2702     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2703     __ bnez(match_mask, L_HAS_ZERO);
2704 
2705     __ bind(L_LOOP_PROCEED);
2706     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2707     __ add(haystack, haystack, wordSize);
2708     __ add(result, result, wordSize / haystack_chr_size);
2709     __ bgez(haystack_len, L_LOOP);
2710 
2711     __ bind(L_POST_LOOP);
2712     __ mv(ch2, -wordSize / haystack_chr_size);
2713     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
2714     __ ld(ch2, Address(haystack));
2715     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2716     __ neg(haystack_len, haystack_len);
2717     __ xorr(ch2, first, ch2);
2718     __ sub(match_mask, ch2, mask1);
2719     __ orr(ch2, ch2, mask2);
2720     __ mv(trailing_zeros, -1); // all bits set
2721     __ j(L_SMALL_PROCEED);
2722 
2723     __ align(OptoLoopAlignment);
2724     __ bind(L_SMALL);
2725     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2726     __ neg(haystack_len, haystack_len);
2727     if (needle_isL != haystack_isL) {
2728       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2729     }
2730     __ xorr(ch2, first, ch2);
2731     __ sub(match_mask, ch2, mask1);
2732     __ orr(ch2, ch2, mask2);
2733     __ mv(trailing_zeros, -1); // all bits set
2734 
2735     __ bind(L_SMALL_PROCEED);
2736     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
2737     __ notr(ch2, ch2);
2738     __ andr(match_mask, match_mask, ch2);
2739     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
2740     __ beqz(match_mask, NOMATCH);
2741 
2742     __ bind(L_SMALL_HAS_ZERO_LOOP);
2743     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros
2744     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2745     __ mv(ch2, wordSize / haystack_chr_size);
2746     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
2747     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2748     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2749     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2750 
2751     __ bind(L_SMALL_CMP_LOOP);
2752     __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
2753     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2754     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
2755     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2756     __ add(trailing_zeros, trailing_zeros, 1);
2757     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
2758     __ beq(first, ch2, L_SMALL_CMP_LOOP);
2759 
2760     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
2761     __ beqz(match_mask, NOMATCH);
2762     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2763     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2764     __ add(result, result, 1);
2765     __ add(haystack, haystack, haystack_chr_size);
2766     __ j(L_SMALL_HAS_ZERO_LOOP);
2767 
2768     __ align(OptoLoopAlignment);
2769     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
2770     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2771     __ j(DONE);
2772 
2773     __ align(OptoLoopAlignment);
2774     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
2775     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2776     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2777     __ j(DONE);
2778 
2779     __ align(OptoLoopAlignment);
2780     __ bind(L_HAS_ZERO);
2781     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2782     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
    __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
    __ orr(haystack_len, haystack_len, needle_len); // stash needle_len in the upper 32 bits of haystack_len
2785     __ sub(result, result, 1); // array index from 0, so result -= 1
2786 
2787     __ bind(L_HAS_ZERO_LOOP);
2788     __ mv(needle_len, wordSize / haystack_chr_size);
2789     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
2790     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
2791     // load next 8 bytes from haystack, and increase result index
2792     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2793     __ add(result, result, 1);
2794     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2795     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2796 
2797     // compare one char
2798     __ bind(L_CMP_LOOP);
2799     __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
2800     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
2801     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2802     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2803     __ add(trailing_zeros, trailing_zeros, 1); // next char index
2804     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
2805     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
2806     __ beq(needle_len, ch2, L_CMP_LOOP);
2807 
2808     __ bind(L_CMP_LOOP_NOMATCH);
2809     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
2810     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index
2811     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2812     __ add(haystack, haystack, haystack_chr_size);
2813     __ j(L_HAS_ZERO_LOOP);
2814 
2815     __ align(OptoLoopAlignment);
2816     __ bind(L_CMP_LOOP_LAST_CMP);
2817     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
2818     __ j(DONE);
2819 
2820     __ align(OptoLoopAlignment);
2821     __ bind(L_CMP_LOOP_LAST_CMP2);
2822     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2823     __ add(result, result, 1);
2824     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2825     __ j(DONE);
2826 
2827     __ align(OptoLoopAlignment);
2828     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore the "result" index. Up to the L_HAS_ZERO block the index was a
    // multiple of wordSize/haystack_chr_size. The byte octet was analyzed in
    // L_HAS_ZERO_LOOP, so result was increased by at most
    // wordSize/haystack_chr_size - 1 and the respective high bits are unchanged.
    // L_LOOP_PROCEED will increase result by the number of analyzed characters,
    // so we can simply reset the lower bits of result here: clear the 3 lower
    // bits for LL and the 2 lower bits for UU/UL.
    // 2) Restore the needle_len and haystack_len values from the "compressed"
    // haystack_len.
    // 3) Advance haystack to the next haystack octet. result & 7 (LL) or
    // result & 3 (UU/UL) is the index of the last analyzed substring inside the
    // current octet, so haystack is at the respective start address and must be
    // advanced to the next octet.
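    // Illustration (an assumption, for the LL case): wordSize / haystack_chr_size == 8,
    // so match_mask = result & 7 is the in-octet index, result &= -8 clears it,
    // and haystack is moved back by match_mask characters to the octet's start.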
2839     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
2840     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
2841     __ andi(result, result, haystack_isL ? -8 : -4);
2842     __ slli(tmp, match_mask, haystack_chr_shift);
2843     __ sub(haystack, haystack, tmp);
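    // truncate haystack_len back to 32 bits (drops the packed needle_len)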
2844     __ addw(haystack_len, haystack_len, zr);
2845     __ j(L_LOOP_PROCEED);
2846 
2847     __ align(OptoLoopAlignment);
2848     __ bind(NOMATCH);
2849     __ mv(result, -1);
2850 
2851     __ bind(DONE);
2852     __ pop_reg(spilled_regs, sp);
2853     __ ret();
2854     return entry;
2855   }
2856 
2857   void generate_string_indexof_stubs()
2858   {
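    // The flag pairs are (needle_isL, haystack_isL): LL searches for a Latin-1
    // needle in a Latin-1 haystack, UU for a UTF-16 needle in a UTF-16 haystack,
    // and UL for a Latin-1 needle in a UTF-16 haystack.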
2859     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
2860     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
2861     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
2862   }
2863 
2864 #ifdef COMPILER2
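  // mulAdd adds the len-digit number in in[], multiplied by the 32-bit value k,
  // into out[] at the given offset and returns the carry. A hedged sketch of one
  // digit step (cf. java.math.BigInteger::implMulAdd):
  //   product = (in[j] & 0xffffffffL) * (k & 0xffffffffL) + (out[i] & 0xffffffffL) + carry;
  //   out[i] = (int) product;
  //   carry  = product >>> 32;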
2865   address generate_mulAdd()
2866   {
2867     __ align(CodeEntryAlignment);
2868     StubCodeMark mark(this, "StubRoutines", "mulAdd");
2869 
2870     address entry = __ pc();
2871 
2872     const Register out     = x10;
2873     const Register in      = x11;
2874     const Register offset  = x12;
2875     const Register len     = x13;
2876     const Register k       = x14;
2877     const Register tmp     = x28;
2878 
2879     BLOCK_COMMENT("Entry:");
2880     __ enter();
2881     __ mul_add(out, in, offset, len, k, tmp);
2882     __ leave();
2883     __ ret();
2884 
2885     return entry;
2886   }
2887 
2888   /**
2889    *  Arguments:
2890    *
2891    *  Input:
2892    *    c_rarg0   - x address
2893    *    c_rarg1   - x length
2894    *    c_rarg2   - y address
2895    *    c_rarg3   - y length
2896    *    c_rarg4   - z address
2897    *    c_rarg5   - z length
2898    */
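  // The stub computes z = x * y over 32-bit digits; z must provide room for
  // xlen + ylen ints (cf. java.math.BigInteger::implMultiplyToLen).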
2899   address generate_multiplyToLen()
2900   {
2901     __ align(CodeEntryAlignment);
2902     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2903     address entry = __ pc();
2904 
2905     const Register x     = x10;
2906     const Register xlen  = x11;
2907     const Register y     = x12;
2908     const Register ylen  = x13;
2909     const Register z     = x14;
2910     const Register zlen  = x15;
2911 
2912     const Register tmp1  = x16;
2913     const Register tmp2  = x17;
2914     const Register tmp3  = x7;
2915     const Register tmp4  = x28;
2916     const Register tmp5  = x29;
2917     const Register tmp6  = x30;
2918     const Register tmp7  = x31;
2919 
2920     BLOCK_COMMENT("Entry:");
2921     __ enter(); // required for proper stackwalking of RuntimeStub frame
2922     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2923     __ leave(); // required for proper stackwalking of RuntimeStub frame
2924     __ ret();
2925 
2926     return entry;
2927   }
2928 
2929   address generate_squareToLen()
2930   {
2931     __ align(CodeEntryAlignment);
2932     StubCodeMark mark(this, "StubRoutines", "squareToLen");
2933     address entry = __ pc();
2934 
2935     const Register x     = x10;
2936     const Register xlen  = x11;
2937     const Register z     = x12;
2938     const Register zlen  = x13;
2939     const Register y     = x14; // == x
2940     const Register ylen  = x15; // == xlen
2941 
2942     const Register tmp1  = x16;
2943     const Register tmp2  = x17;
2944     const Register tmp3  = x7;
2945     const Register tmp4  = x28;
2946     const Register tmp5  = x29;
2947     const Register tmp6  = x30;
2948     const Register tmp7  = x31;
2949 
2950     BLOCK_COMMENT("Entry:");
2951     __ enter();
2952     __ mv(y, x);
2953     __ mv(ylen, xlen);
2954     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2955     __ leave();
2956     __ ret();
2957 
2958     return entry;
2959   }
2960 
2961   // Arguments:
2962   //
2963   // Input:
2964   //   c_rarg0   - newArr address
2965   //   c_rarg1   - oldArr address
2966   //   c_rarg2   - newIdx
2967   //   c_rarg3   - shiftCount
2968   //   c_rarg4   - numIter
2969   //
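  // A scalar sketch of the vector loop below, assuming the semantics of
  // java.math.BigInteger::shiftLeftImplWorker:
  //   for (int i = 0; i < numIter; i++) {
  //     newArr[newIdx + i] = (oldArr[i] << shiftCount)
  //                        | (oldArr[i + 1] >>> (32 - shiftCount));
  //   }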
2970   address generate_bigIntegerLeftShift() {
2971     __ align(CodeEntryAlignment);
2972     StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
2973     address entry = __ pc();
2974 
2975     Label loop, exit;
2976 
2977     Register newArr        = c_rarg0;
2978     Register oldArr        = c_rarg1;
2979     Register newIdx        = c_rarg2;
2980     Register shiftCount    = c_rarg3;
2981     Register numIter       = c_rarg4;
2982 
2983     Register shiftRevCount = c_rarg5;
2984     Register oldArrNext    = t1;
2985 
2986     __ beqz(numIter, exit);
2987     __ shadd(newArr, newIdx, newArr, t0, 2);
2988 
2989     __ li(shiftRevCount, 32);
2990     __ sub(shiftRevCount, shiftRevCount, shiftCount);
2991 
2992     __ bind(loop);
2993     __ addi(oldArrNext, oldArr, 4);
2994     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
2995     __ vle32_v(v0, oldArr);
2996     __ vle32_v(v4, oldArrNext);
2997     __ vsll_vx(v0, v0, shiftCount);
2998     __ vsrl_vx(v4, v4, shiftRevCount);
2999     __ vor_vv(v0, v0, v4);
3000     __ vse32_v(v0, newArr);
3001     __ sub(numIter, numIter, t0);
3002     __ shadd(oldArr, t0, oldArr, t1, 2);
3003     __ shadd(newArr, t0, newArr, t1, 2);
3004     __ bnez(numIter, loop);
3005 
3006     __ bind(exit);
3007     __ ret();
3008 
3009     return entry;
3010   }
3011 
3012   // Arguments:
3013   //
3014   // Input:
3015   //   c_rarg0   - newArr address
3016   //   c_rarg1   - oldArr address
3017   //   c_rarg2   - newIdx
3018   //   c_rarg3   - shiftCount
3019   //   c_rarg4   - numIter
3020   //
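  // A scalar sketch, walking down from the high index as the vector loop below
  // does, assuming the semantics of java.math.BigInteger::shiftRightImplWorker:
  //   for (int i = numIter - 1; i >= 0; i--) {
  //     newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
  //                        | (oldArr[i] << (32 - shiftCount));
  //   }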
3021   address generate_bigIntegerRightShift() {
3022     __ align(CodeEntryAlignment);
3023     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
3024     address entry = __ pc();
3025 
3026     Label loop, exit;
3027 
3028     Register newArr        = c_rarg0;
3029     Register oldArr        = c_rarg1;
3030     Register newIdx        = c_rarg2;
3031     Register shiftCount    = c_rarg3;
3032     Register numIter       = c_rarg4;
3033     Register idx           = numIter;
3034 
3035     Register shiftRevCount = c_rarg5;
3036     Register oldArrNext    = c_rarg6;
3037     Register newArrCur     = t0;
3038     Register oldArrCur     = t1;
3039 
3040     __ beqz(idx, exit);
3041     __ shadd(newArr, newIdx, newArr, t0, 2);
3042 
3043     __ li(shiftRevCount, 32);
3044     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3045 
3046     __ bind(loop);
3047     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3048     __ sub(idx, idx, t0);
3049     __ shadd(oldArrNext, idx, oldArr, t1, 2);
3050     __ shadd(newArrCur, idx, newArr, t1, 2);
3051     __ addi(oldArrCur, oldArrNext, 4);
3052     __ vle32_v(v0, oldArrCur);
3053     __ vle32_v(v4, oldArrNext);
3054     __ vsrl_vx(v0, v0, shiftCount);
3055     __ vsll_vx(v4, v4, shiftRevCount);
3056     __ vor_vv(v0, v0, v4);
3057     __ vse32_v(v0, newArrCur);
3058     __ bnez(idx, loop);
3059 
3060     __ bind(exit);
3061     __ ret();
3062 
3063     return entry;
3064   }
#endif // COMPILER2
3066 
3067 #ifdef COMPILER2
3068   class MontgomeryMultiplyGenerator : public MacroAssembler {
3069 
3070     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3071       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3072 
3073     RegSet _toSave;
3074     bool _squaring;
3075 
3076   public:
3077     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3078       : MacroAssembler(as->code()), _squaring(squaring) {
3079 
3080       // Register allocation
3081 
3082       Register reg = c_rarg0;
3083       Pa_base = reg;       // Argument registers
3084       if (squaring) {
3085         Pb_base = Pa_base;
3086       } else {
3087         Pb_base = ++reg;
3088       }
3089       Pn_base = ++reg;
      Rlen = ++reg;
3091       inv = ++reg;
3092       Pm_base = ++reg;
3093 
      // Working registers:
3095       Ra =  ++reg;      // The current digit of a, b, n, and m.
3096       Rb =  ++reg;
3097       Rm =  ++reg;
3098       Rn =  ++reg;
3099 
3100       Pa =  ++reg;      // Pointers to the current/next digit of a, b, n, and m.
3101       Pb =  ++reg;
3102       Pm =  ++reg;
3103       Pn =  ++reg;
3104 
      tmp0 =  ++reg;    // Three registers which form a
      tmp1 =  ++reg;    // triple-precision accumulator.
3107       tmp2 =  ++reg;
3108 
3109       Ri =  x6;         // Inner and outer loop indexes.
3110       Rj =  x7;
3111 
3112       Rhi_ab = x28;     // Product registers: low and high parts
3113       Rlo_ab = x29;     // of a*b and m*n.
3114       Rhi_mn = x30;
3115       Rlo_mn = x31;
3116 
3117       // x18 and up are callee-saved.
3118       _toSave = RegSet::range(x18, reg) + Pm_base;
3119     }
3120 
3121   private:
3122     void save_regs() {
3123       push_reg(_toSave, sp);
3124     }
3125 
3126     void restore_regs() {
3127       pop_reg(_toSave, sp);
3128     }
3129 
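    // Emit the loop body twice per iteration to halve the branch overhead; an
    // odd trip count enters at the second copy, so any count >= 1 is handled.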
3130     template <typename T>
3131     void unroll_2(Register count, T block) {
3132       Label loop, end, odd;
3133       beqz(count, end);
3134       andi(t0, count, 0x1);
3135       bnez(t0, odd);
3136       align(16);
3137       bind(loop);
3138       (this->*block)();
3139       bind(odd);
3140       (this->*block)();
3141       addi(count, count, -2);
3142       bgtz(count, loop);
3143       bind(end);
3144     }
3145 
3146     template <typename T>
3147     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3148       Label loop, end, odd;
3149       beqz(count, end);
3150       andi(tmp, count, 0x1);
3151       bnez(tmp, odd);
3152       align(16);
3153       bind(loop);
3154       (this->*block)(d, s, tmp);
3155       bind(odd);
3156       (this->*block)(d, s, tmp);
3157       addi(count, count, -2);
3158       bgtz(count, loop);
3159       bind(end);
3160     }
3161 
3162     void pre1(RegisterOrConstant i) {
3163       block_comment("pre1");
3164       // Pa = Pa_base;
3165       // Pb = Pb_base + i;
3166       // Pm = Pm_base;
3167       // Pn = Pn_base + i;
3168       // Ra = *Pa;
3169       // Rb = *Pb;
3170       // Rm = *Pm;
3171       // Rn = *Pn;
3172       if (i.is_register()) {
3173         slli(t0, i.as_register(), LogBytesPerWord);
3174       } else {
3175         mv(t0, i.as_constant());
3176         slli(t0, t0, LogBytesPerWord);
3177       }
3178 
3179       mv(Pa, Pa_base);
3180       add(Pb, Pb_base, t0);
3181       mv(Pm, Pm_base);
3182       add(Pn, Pn_base, t0);
3183 
3184       ld(Ra, Address(Pa));
3185       ld(Rb, Address(Pb));
3186       ld(Rm, Address(Pm));
3187       ld(Rn, Address(Pn));
3188 
3189       // Zero the m*n result.
3190       mv(Rhi_mn, zr);
3191       mv(Rlo_mn, zr);
3192     }
3193 
3194     // The core multiply-accumulate step of a Montgomery
3195     // multiplication.  The idea is to schedule operations as a
3196     // pipeline so that instructions with long latencies (loads and
3197     // multiplies) have time to complete before their results are
    // used.  This benefits in-order implementations of the
    // architecture the most, but out-of-order ones also benefit.
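    // In effect, each step computes (tmp2:tmp1:tmp0) += Ra*Rb + Rm*Rn, folding
    // in the m*n product from the previous iteration first so its operands have
    // time to arrive.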
3200     void step() {
3201       block_comment("step");
3202       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3203       // Ra = *++Pa;
3204       // Rb = *--Pb;
3205       mulhu(Rhi_ab, Ra, Rb);
3206       mul(Rlo_ab, Ra, Rb);
3207       addi(Pa, Pa, wordSize);
3208       ld(Ra, Address(Pa));
3209       addi(Pb, Pb, -wordSize);
3210       ld(Rb, Address(Pb));
3211       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
3212                                             // previous iteration.
3213       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3214       // Rm = *++Pm;
3215       // Rn = *--Pn;
3216       mulhu(Rhi_mn, Rm, Rn);
3217       mul(Rlo_mn, Rm, Rn);
3218       addi(Pm, Pm, wordSize);
3219       ld(Rm, Address(Pm));
3220       addi(Pn, Pn, -wordSize);
3221       ld(Rn, Address(Pn));
3222       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3223     }
3224 
3225     void post1() {
3226       block_comment("post1");
3227 
3228       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3229       // Ra = *++Pa;
3230       // Rb = *--Pb;
3231       mulhu(Rhi_ab, Ra, Rb);
3232       mul(Rlo_ab, Ra, Rb);
3233       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3234       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3235 
3236       // *Pm = Rm = tmp0 * inv;
3237       mul(Rm, tmp0, inv);
3238       sd(Rm, Address(Pm));
3239 
3240       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3241       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3242       mulhu(Rhi_mn, Rm, Rn);
3243 
3244 #ifndef PRODUCT
3245       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3246       {
3247         mul(Rlo_mn, Rm, Rn);
3248         add(Rlo_mn, tmp0, Rlo_mn);
3249         Label ok;
3250         beqz(Rlo_mn, ok);
3251         stop("broken Montgomery multiply");
3252         bind(ok);
3253       }
3254 #endif
3255       // We have very carefully set things up so that
3256       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3257       // the lower half of Rm * Rn because we know the result already:
3258       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
      // tmp0 != 0.  So, rather than do a mul and a cad we just set
3260       // the carry flag iff tmp0 is nonzero.
3261       //
3262       // mul(Rlo_mn, Rm, Rn);
3263       // cad(zr, tmp0, Rlo_mn);
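      // Worked check (with b == 2^64): if tmp0 == 0 the low half is 0 and there
      // is no carry; if tmp0 != 0 the low half is 2^64 - tmp0, so
      // tmp0 + (2^64 - tmp0) == 2^64, i.e. carry == 1.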
3264       addi(t0, tmp0, -1);
3265       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3266       cadc(tmp0, tmp1, Rhi_mn, t0);
3267       adc(tmp1, tmp2, zr, t0);
3268       mv(tmp2, zr);
3269     }
3270 
3271     void pre2(Register i, Register len) {
3272       block_comment("pre2");
3273       // Pa = Pa_base + i-len;
3274       // Pb = Pb_base + len;
3275       // Pm = Pm_base + i-len;
3276       // Pn = Pn_base + len;
3277 
3278       sub(Rj, i, len);
3279       // Rj == i-len
3280 
3281       // Ra as temp register
3282       slli(Ra, Rj, LogBytesPerWord);
3283       add(Pa, Pa_base, Ra);
3284       add(Pm, Pm_base, Ra);
3285       slli(Ra, len, LogBytesPerWord);
3286       add(Pb, Pb_base, Ra);
3287       add(Pn, Pn_base, Ra);
3288 
3289       // Ra = *++Pa;
3290       // Rb = *--Pb;
3291       // Rm = *++Pm;
3292       // Rn = *--Pn;
3293       add(Pa, Pa, wordSize);
3294       ld(Ra, Address(Pa));
3295       add(Pb, Pb, -wordSize);
3296       ld(Rb, Address(Pb));
3297       add(Pm, Pm, wordSize);
3298       ld(Rm, Address(Pm));
3299       add(Pn, Pn, -wordSize);
3300       ld(Rn, Address(Pn));
3301 
3302       mv(Rhi_mn, zr);
3303       mv(Rlo_mn, zr);
3304     }
3305 
3306     void post2(Register i, Register len) {
3307       block_comment("post2");
3308       sub(Rj, i, len);
3309 
3310       cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
3311 
3312       // As soon as we know the least significant digit of our result,
3313       // store it.
3314       // Pm_base[i-len] = tmp0;
3315       // Rj as temp register
3316       slli(Rj, Rj, LogBytesPerWord);
3317       add(Rj, Pm_base, Rj);
3318       sd(tmp0, Address(Rj));
3319 
3320       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3321       cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
3322       adc(tmp1, tmp2, zr, t0);
3323       mv(tmp2, zr);
3324     }
3325 
3326     // A carry in tmp0 after Montgomery multiplication means that we
3327     // should subtract multiples of n from our result in m.  We'll
3328     // keep doing that until there is no carry.
3329     void normalize(Register len) {
3330       block_comment("normalize");
3331       // while (tmp0)
3332       //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
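      // The inner loop subtracts n from m using two's-complement arithmetic:
      // Rm + ~Rn + t0, with t0 propagating the (no-)borrow between digits.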
3333       Label loop, post, again;
3334       Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
3335       beqz(tmp0, post); {
3336         bind(again); {
3337           mv(i, zr);
3338           mv(cnt, len);
3339           slli(Rn, i, LogBytesPerWord);
3340           add(Rm, Pm_base, Rn);
3341           ld(Rm, Address(Rm));
3342           add(Rn, Pn_base, Rn);
3343           ld(Rn, Address(Rn));
3344           li(t0, 1); // set carry flag, i.e. no borrow
3345           align(16);
3346           bind(loop); {
3347             notr(Rn, Rn);
3348             add(Rm, Rm, t0);
3349             add(Rm, Rm, Rn);
3350             sltu(t0, Rm, Rn);
3351             slli(Rn, i, LogBytesPerWord); // Rn as temp register
3352             add(Rn, Pm_base, Rn);
3353             sd(Rm, Address(Rn));
3354             add(i, i, 1);
3355             slli(Rn, i, LogBytesPerWord);
3356             add(Rm, Pm_base, Rn);
3357             ld(Rm, Address(Rm));
3358             add(Rn, Pn_base, Rn);
3359             ld(Rn, Address(Rn));
3360             sub(cnt, cnt, 1);
3361           } bnez(cnt, loop);
3362           addi(tmp0, tmp0, -1);
3363           add(tmp0, tmp0, t0);
3364         } bnez(tmp0, again);
3365       } bind(post);
3366     }
3367 
3368     // Move memory at s to d, reversing words.
3369     //    Increments d to end of copied memory
3370     //    Destroys tmp1, tmp2
3371     //    Preserves len
3372     //    Leaves s pointing to the address which was in d at start
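    //    In effect this converts between the big-endian int layout of Java
    //    arrays and the little-endian 64-bit digit layout used here: words are
    //    copied in reverse order and each word's 32-bit halves are swapped.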
3373     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3374       assert(tmp1 < x28 && tmp2 < x28, "register corruption");
3375 
3376       slli(tmp1, len, LogBytesPerWord);
3377       add(s, s, tmp1);
3378       mv(tmp1, len);
3379       unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3380       slli(tmp1, len, LogBytesPerWord);
3381       sub(s, d, tmp1);
3382     }
3383     // [63...0] -> [31...0][63...32]
3384     void reverse1(Register d, Register s, Register tmp) {
3385       addi(s, s, -wordSize);
3386       ld(tmp, Address(s));
3387       ror_imm(tmp, tmp, 32, t0);
3388       sd(tmp, Address(d));
3389       addi(d, d, wordSize);
3390     }
3391 
3392     void step_squaring() {
3393       // An extra ACC
3394       step();
3395       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3396     }
3397 
3398     void last_squaring(Register i) {
3399       Label dont;
3400       // if ((i & 1) == 0) {
3401       andi(t0, i, 0x1);
3402       bnez(t0, dont); {
3403         // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3404         // Ra = *++Pa;
3405         // Rb = *--Pb;
3406         mulhu(Rhi_ab, Ra, Rb);
3407         mul(Rlo_ab, Ra, Rb);
3408         acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3409       } bind(dont);
3410     }
3411 
3412     void extra_step_squaring() {
3413       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3414 
3415       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3416       // Rm = *++Pm;
3417       // Rn = *--Pn;
3418       mulhu(Rhi_mn, Rm, Rn);
3419       mul(Rlo_mn, Rm, Rn);
3420       addi(Pm, Pm, wordSize);
3421       ld(Rm, Address(Pm));
3422       addi(Pn, Pn, -wordSize);
3423       ld(Rn, Address(Pn));
3424     }
3425 
3426     void post1_squaring() {
3427       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3428 
3429       // *Pm = Rm = tmp0 * inv;
3430       mul(Rm, tmp0, inv);
3431       sd(Rm, Address(Pm));
3432 
3433       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3434       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3435       mulhu(Rhi_mn, Rm, Rn);
3436 
3437 #ifndef PRODUCT
3438       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3439       {
3440         mul(Rlo_mn, Rm, Rn);
3441         add(Rlo_mn, tmp0, Rlo_mn);
3442         Label ok;
3443         beqz(Rlo_mn, ok); {
3444           stop("broken Montgomery multiply");
3445         } bind(ok);
3446       }
3447 #endif
3448       // We have very carefully set things up so that
3449       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3450       // the lower half of Rm * Rn because we know the result already:
3451       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3452       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3453       // the carry flag iff tmp0 is nonzero.
3454       //
3455       // mul(Rlo_mn, Rm, Rn);
      // cad(zr, tmp0, Rlo_mn);
3457       addi(t0, tmp0, -1);
3458       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3459       cadc(tmp0, tmp1, Rhi_mn, t0);
3460       adc(tmp1, tmp2, zr, t0);
3461       mv(tmp2, zr);
3462     }
3463 
    // Triple-precision accumulate: (tmp2:tmp1:tmp0) += (Rhi:Rlo), using t0 as the carry
3465     void acc(Register Rhi, Register Rlo,
3466              Register tmp0, Register tmp1, Register tmp2) {
3467       cad(tmp0, tmp0, Rlo, t0);
3468       cadc(tmp1, tmp1, Rhi, t0);
3469       adc(tmp2, tmp2, zr, t0);
3470     }
3471 
3472   public:
3473     /**
3474      * Fast Montgomery multiplication.  The derivation of the
3475      * algorithm is in A Cryptographic Library for the Motorola
3476      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3477      *
3478      * Arguments:
3479      *
3480      * Inputs for multiplication:
3481      *   c_rarg0   - int array elements a
3482      *   c_rarg1   - int array elements b
3483      *   c_rarg2   - int array elements n (the modulus)
3484      *   c_rarg3   - int length
3485      *   c_rarg4   - int inv
3486      *   c_rarg5   - int array elements m (the result)
3487      *
3488      * Inputs for squaring:
3489      *   c_rarg0   - int array elements a
3490      *   c_rarg1   - int array elements n (the modulus)
3491      *   c_rarg2   - int length
3492      *   c_rarg3   - int inv
3493      *   c_rarg4   - int array elements m (the result)
3494      *
3495      */
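    // Outline of the computed recurrence (not the emitted code; b == 2^64 and
    // the operands are len 64-bit digits): for each output column i, the
    // triple-precision accumulator t gathers
    //   t += sum_j(a[j] * b[i-j]) + sum_j(m[j] * n[i-j])
    // and, while i < len, the next digit m[i] = (t * inv) mod b is chosen so
    // that t + m[i]*n[0] == 0 (mod b); t is then shifted right one digit.
    // The result satisfies m == a * b * 2^(-64*len) (mod n).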
3496     address generate_multiply() {
3497       Label argh, nothing;
3498       bind(argh);
3499       stop("MontgomeryMultiply total_allocation must be <= 8192");
3500 
3501       align(CodeEntryAlignment);
3502       address entry = pc();
3503 
3504       beqz(Rlen, nothing);
3505 
3506       enter();
3507 
3508       // Make room.
3509       li(Ra, 512);
3510       bgt(Rlen, Ra, argh);
3511       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3512       sub(Ra, sp, Ra);
3513       andi(sp, Ra, -2 * wordSize);
3514 
3515       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3516 
3517       {
3518         // Copy input args, reversing as we go.  We use Ra as a
3519         // temporary variable.
3520         reverse(Ra, Pa_base, Rlen, Ri, Rj);
        if (!_squaring) {
          reverse(Ra, Pb_base, Rlen, Ri, Rj);
        }
3523         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3524       }
3525 
      // Push all callee-saved registers and also Pm_base which we'll need
      // at the end.
3528       save_regs();
3529 
3530 #ifndef PRODUCT
3531       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3532       {
3533         ld(Rn, Address(Pn_base));
3534         mul(Rlo_mn, Rn, inv);
3535         li(t0, -1);
3536         Label ok;
3537         beq(Rlo_mn, t0, ok);
3538         stop("broken inverse in Montgomery multiply");
3539         bind(ok);
3540       }
3541 #endif
3542 
3543       mv(Pm_base, Ra);
3544 
3545       mv(tmp0, zr);
3546       mv(tmp1, zr);
3547       mv(tmp2, zr);
3548 
3549       block_comment("for (int i = 0; i < len; i++) {");
3550       mv(Ri, zr); {
3551         Label loop, end;
3552         bge(Ri, Rlen, end);
3553 
3554         bind(loop);
3555         pre1(Ri);
3556 
3557         block_comment("  for (j = i; j; j--) {"); {
3558           mv(Rj, Ri);
3559           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3560         } block_comment("  } // j");
3561 
3562         post1();
3563         addw(Ri, Ri, 1);
3564         blt(Ri, Rlen, loop);
3565         bind(end);
3566         block_comment("} // i");
3567       }
3568 
3569       block_comment("for (int i = len; i < 2*len; i++) {");
3570       mv(Ri, Rlen); {
3571         Label loop, end;
3572         slli(t0, Rlen, 1);
3573         bge(Ri, t0, end);
3574 
3575         bind(loop);
3576         pre2(Ri, Rlen);
3577 
3578         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3579           slliw(Rj, Rlen, 1);
3580           subw(Rj, Rj, Ri);
3581           subw(Rj, Rj, 1);
3582           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3583         } block_comment("  } // j");
3584 
3585         post2(Ri, Rlen);
3586         addw(Ri, Ri, 1);
3587         slli(t0, Rlen, 1);
3588         blt(Ri, t0, loop);
3589         bind(end);
3590       }
3591       block_comment("} // i");
3592 
3593       normalize(Rlen);
3594 
3595       mv(Ra, Pm_base);  // Save Pm_base in Ra
3596       restore_regs();  // Restore caller's Pm_base
3597 
3598       // Copy our result into caller's Pm_base
3599       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3600 
3601       leave();
3602       bind(nothing);
3603       ret();
3604 
3605       return entry;
3606     }
3607 
3608     /**
3609      *
3610      * Arguments:
3611      *
3612      * Inputs:
3613      *   c_rarg0   - int array elements a
3614      *   c_rarg1   - int array elements n (the modulus)
3615      *   c_rarg2   - int length
3616      *   c_rarg3   - int inv
3617      *   c_rarg4   - int array elements m (the result)
3618      *
3619      */
3620     address generate_square() {
3621       Label argh;
3622       bind(argh);
3623       stop("MontgomeryMultiply total_allocation must be <= 8192");
3624 
3625       align(CodeEntryAlignment);
3626       address entry = pc();
3627 
3628       enter();
3629 
3630       // Make room.
3631       li(Ra, 512);
3632       bgt(Rlen, Ra, argh);
3633       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3634       sub(Ra, sp, Ra);
3635       andi(sp, Ra, -2 * wordSize);
3636 
3637       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3638 
3639       {
3640         // Copy input args, reversing as we go.  We use Ra as a
3641         // temporary variable.
3642         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3643         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3644       }
3645 
      // Push all callee-saved registers and also Pm_base which we'll need
      // at the end.
3648       save_regs();
3649 
3650       mv(Pm_base, Ra);
3651 
3652       mv(tmp0, zr);
3653       mv(tmp1, zr);
3654       mv(tmp2, zr);
3655 
3656       block_comment("for (int i = 0; i < len; i++) {");
3657       mv(Ri, zr); {
3658         Label loop, end;
3659         bind(loop);
3660         bge(Ri, Rlen, end);
3661 
3662         pre1(Ri);
3663 
        block_comment("  for (j = (i+1)/2; j; j--) {"); {
3665           addi(Rj, Ri, 1);
3666           srliw(Rj, Rj, 1);
3667           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3668         } block_comment("  } // j");
3669 
3670         last_squaring(Ri);
3671 
3672         block_comment("  for (j = i/2; j; j--) {"); {
3673           srliw(Rj, Ri, 1);
3674           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3675         } block_comment("  } // j");
3676 
3677         post1_squaring();
3678         addi(Ri, Ri, 1);
3679         blt(Ri, Rlen, loop);
3680 
3681         bind(end);
3682         block_comment("} // i");
3683       }
3684 
3685       block_comment("for (int i = len; i < 2*len; i++) {");
3686       mv(Ri, Rlen); {
3687         Label loop, end;
3688         bind(loop);
3689         slli(t0, Rlen, 1);
3690         bge(Ri, t0, end);
3691 
3692         pre2(Ri, Rlen);
3693 
3694         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
3695           slli(Rj, Rlen, 1);
3696           sub(Rj, Rj, Ri);
3697           sub(Rj, Rj, 1);
3698           srliw(Rj, Rj, 1);
3699           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3700         } block_comment("  } // j");
3701 
3702         last_squaring(Ri);
3703 
3704         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
3705           slli(Rj, Rlen, 1);
3706           sub(Rj, Rj, Ri);
3707           srliw(Rj, Rj, 1);
3708           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3709         } block_comment("  } // j");
3710 
3711         post2(Ri, Rlen);
3712         addi(Ri, Ri, 1);
3713         slli(t0, Rlen, 1);
3714         blt(Ri, t0, loop);
3715 
3716         bind(end);
3717         block_comment("} // i");
3718       }
3719 
3720       normalize(Rlen);
3721 
3722       mv(Ra, Pm_base);  // Save Pm_base in Ra
3723       restore_regs();  // Restore caller's Pm_base
3724 
3725       // Copy our result into caller's Pm_base
3726       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3727 
3728       leave();
3729       ret();
3730 
3731       return entry;
3732     }
3733   };
3734 #endif // COMPILER2
3735 
3736   // Continuation point for throwing of implicit exceptions that are
3737   // not handled in the current activation. Fabricates an exception
3738   // oop and initiates normal exception dispatching in this
3739   // frame. Since we need to preserve callee-saved values (currently
3740   // only for C2, but done for C1 as well) we need a callee-saved oop
3741   // map and therefore have to make these stubs into RuntimeStubs
3742   // rather than BufferBlobs.  If the compiler needs all registers to
3743   // be preserved between the fault point and the exception handler
3744   // then it must assume responsibility for that in
3745   // AbstractCompiler::continuation_for_implicit_null_exception or
3746   // continuation_for_implicit_division_by_zero_exception. All other
3747   // implicit exceptions (e.g., NullPointerException or
3748   // AbstractMethodError on entry) are either at call sites or
3749   // otherwise assume that stack unwinding will be initiated, so
  // caller-saved registers were assumed volatile in the compiler.
3751 
3752 #undef __
3753 #define __ masm->
3754 
3755   address generate_throw_exception(const char* name,
3756                                    address runtime_entry,
3757                                    Register arg1 = noreg,
3758                                    Register arg2 = noreg) {
3759     // Information about frame layout at time of blocking runtime call.
3760     // Note that we only have to preserve callee-saved registers since
3761     // the compilers are responsible for supplying a continuation point
3762     // if they expect all registers to be preserved.
3763     // n.b. riscv asserts that frame::arg_reg_save_area_bytes == 0
3764     assert_cond(runtime_entry != NULL);
3765     enum layout {
3766       fp_off = 0,
3767       fp_off2,
3768       return_off,
3769       return_off2,
3770       framesize // inclusive of return address
3771     };
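    // framesize counts 32-bit slots: two 64-bit words (saved fp and ra),
    // i.e. 16 bytes, keeping sp 16-byte aligned.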
3772 
3773     const int insts_size = 512;
3774     const int locs_size  = 64;
3775 
3776     CodeBuffer code(name, insts_size, locs_size);
3777     OopMapSet* oop_maps  = new OopMapSet();
3778     MacroAssembler* masm = new MacroAssembler(&code);
3779     assert_cond(oop_maps != NULL && masm != NULL);
3780 
3781     address start = __ pc();
3782 
    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage, and it sets up last_Java_sp slightly
    // differently than the real call_VM does.
3787 
3788     __ enter(); // Save FP and RA before call
3789 
3790     assert(is_even(framesize / 2), "sp not 16-byte aligned");
3791 
3792     // ra and fp are already in place
3793     __ addi(sp, fp, 0 - ((unsigned)framesize << LogBytesPerInt)); // prolog
3794 
3795     int frame_complete = __ pc() - start;
3796 
3797     // Set up last_Java_sp and last_Java_fp
3798     address the_pc = __ pc();
3799     __ set_last_Java_frame(sp, fp, the_pc, t0);
3800 
3801     // Call runtime
3802     if (arg1 != noreg) {
3803       assert(arg2 != c_rarg1, "clobbered");
3804       __ mv(c_rarg1, arg1);
3805     }
3806     if (arg2 != noreg) {
3807       __ mv(c_rarg2, arg2);
3808     }
3809     __ mv(c_rarg0, xthread);
3810     BLOCK_COMMENT("call runtime_entry");
3811     int32_t offset = 0;
3812     __ movptr_with_offset(t0, runtime_entry, offset);
3813     __ jalr(x1, t0, offset);
3814 
3815     // Generate oop map
3816     OopMap* map = new OopMap(framesize, 0);
3817     assert_cond(map != NULL);
3818 
3819     oop_maps->add_gc_map(the_pc - start, map);
3820 
3821     __ reset_last_Java_frame(true);
3822 
3823     __ leave();
3824 
3825     // check for pending exceptions
3826 #ifdef ASSERT
3827     Label L;
3828     __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
3829     __ bnez(t0, L);
3830     __ should_not_reach_here();
3831     __ bind(L);
3832 #endif // ASSERT
3833     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3834 
3836     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3837     RuntimeStub* stub =
3838       RuntimeStub::new_runtime_stub(name,
3839                                     &code,
3840                                     frame_complete,
3841                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3842                                     oop_maps, false);
    assert(stub != NULL, "failed to create runtime stub!");
3844     return stub->entry_point();
3845   }
3846 
3847   // Initialization
3848   void generate_initial() {
    // Generates the initial stubs and initializes the entry points
3850 
    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
3856 
3857     StubRoutines::_forward_exception_entry = generate_forward_exception();
3858 
3859     StubRoutines::_call_stub_entry =
3860       generate_call_stub(StubRoutines::_call_stub_return_address);
3861 
    // This entry is referenced by megamorphic call sites
3863     StubRoutines::_catch_exception_entry = generate_catch_exception();
3864 
3865     // Build this early so it's available for the interpreter.
3866     StubRoutines::_throw_StackOverflowError_entry =
3867       generate_throw_exception("StackOverflowError throw_exception",
3868                                CAST_FROM_FN_PTR(address,
3869                                                 SharedRuntime::throw_StackOverflowError));
3870     StubRoutines::_throw_delayed_StackOverflowError_entry =
3871       generate_throw_exception("delayed StackOverflowError throw_exception",
3872                                CAST_FROM_FN_PTR(address,
3873                                                 SharedRuntime::throw_delayed_StackOverflowError));
3874     // Safefetch stubs.
3875     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
3876                                                        &StubRoutines::_safefetch32_fault_pc,
3877                                                        &StubRoutines::_safefetch32_continuation_pc);
3878     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
3879                                                        &StubRoutines::_safefetchN_fault_pc,
3880                                                        &StubRoutines::_safefetchN_continuation_pc);
3881   }
3882 
3883   void generate_all() {
3884     // support for verify_oop (must happen after universe_init)
3885     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
3886     StubRoutines::_throw_AbstractMethodError_entry =
3887       generate_throw_exception("AbstractMethodError throw_exception",
3888                                CAST_FROM_FN_PTR(address,
3889                                                 SharedRuntime::
3890                                                 throw_AbstractMethodError));
3891 
3892     StubRoutines::_throw_IncompatibleClassChangeError_entry =
3893       generate_throw_exception("IncompatibleClassChangeError throw_exception",
3894                                CAST_FROM_FN_PTR(address,
3895                                                 SharedRuntime::
3896                                                 throw_IncompatibleClassChangeError));
3897 
3898     StubRoutines::_throw_NullPointerException_at_call_entry =
3899       generate_throw_exception("NullPointerException at call throw_exception",
3900                                CAST_FROM_FN_PTR(address,
3901                                                 SharedRuntime::
3902                                                 throw_NullPointerException_at_call));
3903     // arraycopy stubs used by compilers
3904     generate_arraycopy_stubs();
3905 
3906 #ifdef COMPILER2
3907     if (UseMulAddIntrinsic) {
3908       StubRoutines::_mulAdd = generate_mulAdd();
3909     }
3910 
3911     if (UseMultiplyToLenIntrinsic) {
3912       StubRoutines::_multiplyToLen = generate_multiplyToLen();
3913     }
3914 
3915     if (UseSquareToLenIntrinsic) {
3916       StubRoutines::_squareToLen = generate_squareToLen();
3917     }
3918 
3919     if (UseMontgomeryMultiplyIntrinsic) {
3920       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
3921       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
3922       StubRoutines::_montgomeryMultiply = g.generate_multiply();
3923     }
3924 
3925     if (UseMontgomerySquareIntrinsic) {
3926       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
3927       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
3928       StubRoutines::_montgomerySquare = g.generate_square();
3929     }
3930 
3931     if (UseRVVForBigIntegerShiftIntrinsics) {
3932       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
3933       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
3934     }
3935 #endif
3936 
3937     generate_compare_long_strings();
3938 
3939     generate_string_indexof_stubs();
3940 
3941     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
3942     if (bs_nm != NULL) {
3943       StubRoutines::riscv::_method_entry_barrier = generate_method_entry_barrier();
3944     }
3945 
3946     StubRoutines::riscv::set_completed();
3947   }
3948 
3949  public:
3950   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3951     if (all) {
3952       generate_all();
3953     } else {
3954       generate_initial();
3955     }
3956   }
3957 
3958   ~StubGenerator() {}
3959 }; // end class declaration
3960 
3961 #define UCM_TABLE_MAX_ENTRIES 8
3962 void StubGenerator_generate(CodeBuffer* code, bool all) {
3963   if (UnsafeCopyMemory::_table == NULL) {
3964     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
3965   }
3966 
3967   StubGenerator g(code, all);
3968 }