1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/universe.hpp"
  35 #include "nativeInst_riscv.hpp"
  36 #include "oops/instanceOop.hpp"
  37 #include "oops/method.hpp"
  38 #include "oops/objArrayKlass.hpp"
  39 #include "oops/oop.inline.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "runtime/frame.inline.hpp"
  42 #include "runtime/handles.inline.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubCodeGenerator.hpp"
  45 #include "runtime/stubRoutines.hpp"
  46 #include "runtime/thread.inline.hpp"
  47 #include "utilities/align.hpp"
  48 #include "utilities/powerOfTwo.hpp"
  49 #ifdef COMPILER2
  50 #include "opto/runtime.hpp"
  51 #endif
  52 #if INCLUDE_ZGC
  53 #include "gc/z/zThreadLocalData.hpp"
  54 #endif
  55 
  56 // Declaration and definition of StubGenerator (no .hpp file).
  57 // For a more detailed description of the stub routine structure
  58 // see the comment in stubRoutines.hpp
  59 
  60 #undef __
  61 #define __ _masm->
  62 
  63 #ifdef PRODUCT
  64 #define BLOCK_COMMENT(str) /* nothing */
  65 #else
  66 #define BLOCK_COMMENT(str) __ block_comment(str)
  67 #endif
  68 
  69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  70 
  71 // Stub Code definitions
  72 
  73 class StubGenerator: public StubCodeGenerator {
  74  private:
  75 
  76 #ifdef PRODUCT
  77 #define inc_counter_np(counter) ((void)0)
  78 #else
  79   void inc_counter_np_(int& counter) {
  80     __ la(t1, ExternalAddress((address)&counter));
  81     __ lwu(t0, Address(t1, 0));
  82     __ addiw(t0, t0, 1);
  83     __ sw(t0, Address(t1, 0));
  84   }
  85 #define inc_counter_np(counter) \
  86   BLOCK_COMMENT("inc_counter " #counter); \
  87   inc_counter_np_(counter);
  88 #endif
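       // Typical use, as in the checkcast copy stub later in this section:
       //   inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);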
  89 
  90   // Call stubs are used to call Java from C
  91   //
  92   // Arguments:
  93   //    c_rarg0:   call wrapper address                   address
  94   //    c_rarg1:   result                                 address
  95   //    c_rarg2:   result type                            BasicType
  96   //    c_rarg3:   method                                 Method*
  97   //    c_rarg4:   (interpreter) entry point              address
  98   //    c_rarg5:   parameters                             intptr_t*
  99   //    c_rarg6:   parameter size (in words)              int
 100   //    c_rarg7:   thread                                 Thread*
 101   //
 102   // There is no return from the stub itself as any Java result
 103   // is written to result
 104   //
 105   // we save x1 (ra) as the return PC at the base of the frame and
 106   // link x8 (fp) below it as the frame pointer installing sp (x2)
 107   // into fp.
 108   //
 109   // we save x10-x17, which accounts for all the c arguments.
 110   //
 111   // TODO: strictly do we need to save them all? they are treated as
 112   // volatile by C so could we omit saving the ones we are going to
 113   // place in global registers (thread? method?) or those we only use
 114   // during setup of the Java call?
 115   //
 116   // we don't need to save x5 which C uses as an indirect result location
 117   // return register.
 118   //
 119   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 120   // volatile
 121   //
 122   // we save x18-x27 which Java uses as temporary registers and C
 123   // expects to be callee-save
 124   //
 125   // so the stub frame looks like this when we enter Java code
 126   //
 127   //     [ return_from_Java     ] <--- sp
 128   //     [ argument word n      ]
 129   //      ...
 130   // -22 [ argument word 1      ]
 131   // -21 [ saved x27            ] <--- sp_after_call
 132   // -20 [ saved x26            ]
 133   // -19 [ saved x25            ]
 134   // -18 [ saved x24            ]
 135   // -17 [ saved x23            ]
 136   // -16 [ saved x22            ]
 137   // -15 [ saved x21            ]
 138   // -14 [ saved x20            ]
 139   // -13 [ saved x19            ]
 140   // -12 [ saved x18            ]
 141   // -11 [ saved x9             ]
 142   // -10 [ call wrapper   (x10) ]
 143   //  -9 [ result         (x11) ]
 144   //  -8 [ result type    (x12) ]
 145   //  -7 [ method         (x13) ]
 146   //  -6 [ entry point    (x14) ]
 147   //  -5 [ parameters     (x15) ]
 148   //  -4 [ parameter size (x16) ]
 149   //  -3 [ thread         (x17) ]
 150   //  -2 [ saved fp       (x8)  ]
 151   //  -1 [ saved ra       (x1)  ]
 152   //   0 [                      ] <--- fp == saved sp (x2)
 153 
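       // For reference, the VM reaches this stub from C++ through a function
       // pointer (the CallStub typedef in stubRoutines.hpp) whose shape is
       // roughly the sketch below; the eight arguments arrive in
       // c_rarg0..c_rarg7 as listed above:
       //
       //   void call_stub(address   call_wrapper,        // c_rarg0
       //                  intptr_t* result,              // c_rarg1
       //                  BasicType result_type,         // c_rarg2
       //                  Method*   method,              // c_rarg3
       //                  address   entry_point,         // c_rarg4
       //                  intptr_t* parameters,          // c_rarg5
       //                  int       size_of_parameters,  // c_rarg6
       //                  Thread*   thread);             // c_rarg7
       //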
 154   // Call stub stack layout word offsets from fp
 155   enum call_stub_layout {
 156     sp_after_call_off  = -21,
 157 
 158     x27_off            = -21,
 159     x26_off            = -20,
 160     x25_off            = -19,
 161     x24_off            = -18,
 162     x23_off            = -17,
 163     x22_off            = -16,
 164     x21_off            = -15,
 165     x20_off            = -14,
 166     x19_off            = -13,
 167     x18_off            = -12,
 168     x9_off             = -11,
 169 
 170     call_wrapper_off   = -10,
 171     result_off         = -9,
 172     result_type_off    = -8,
 173     method_off         = -7,
 174     entry_point_off    = -6,
 175     parameters_off     = -5,
 176     parameter_size_off = -4,
 177     thread_off         = -3,
 178     fp_f               = -2,
 179     retaddr_off        = -1,
 180   };
 181 
 182   address generate_call_stub(address& return_address) {
 183     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 184            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 185            "adjust this code");
 186 
 187     StubCodeMark mark(this, "StubRoutines", "call_stub");
 188     address start = __ pc();
 189 
 190     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 191 
 192     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 193     const Address result        (fp, result_off         * wordSize);
 194     const Address result_type   (fp, result_type_off    * wordSize);
 195     const Address method        (fp, method_off         * wordSize);
 196     const Address entry_point   (fp, entry_point_off    * wordSize);
 197     const Address parameters    (fp, parameters_off     * wordSize);
 198     const Address parameter_size(fp, parameter_size_off * wordSize);
 199 
 200     const Address thread        (fp, thread_off         * wordSize);
 201 
 202     const Address x27_save      (fp, x27_off            * wordSize);
 203     const Address x26_save      (fp, x26_off            * wordSize);
 204     const Address x25_save      (fp, x25_off            * wordSize);
 205     const Address x24_save      (fp, x24_off            * wordSize);
 206     const Address x23_save      (fp, x23_off            * wordSize);
 207     const Address x22_save      (fp, x22_off            * wordSize);
 208     const Address x21_save      (fp, x21_off            * wordSize);
 209     const Address x20_save      (fp, x20_off            * wordSize);
 210     const Address x19_save      (fp, x19_off            * wordSize);
 211     const Address x18_save      (fp, x18_off            * wordSize);
 212 
 213     const Address x9_save       (fp, x9_off             * wordSize);
 214 
 215     // stub code
 216 
 217     address riscv_entry = __ pc();
 218 
 219     // set up frame and move sp to end of save area
 220     __ enter();
 221     __ addi(sp, fp, sp_after_call_off * wordSize);
 222 
 223     // save register parameters and Java temporary/global registers
 224     // n.b. we save thread even though it gets installed in
 225     // xthread because we want to sanity check tp later
 226     __ sd(c_rarg7, thread);
 227     __ sw(c_rarg6, parameter_size);
 228     __ sd(c_rarg5, parameters);
 229     __ sd(c_rarg4, entry_point);
 230     __ sd(c_rarg3, method);
 231     __ sd(c_rarg2, result_type);
 232     __ sd(c_rarg1, result);
 233     __ sd(c_rarg0, call_wrapper);
 234 
 235     __ sd(x9, x9_save);
 236 
 237     __ sd(x18, x18_save);
 238     __ sd(x19, x19_save);
 239     __ sd(x20, x20_save);
 240     __ sd(x21, x21_save);
 241     __ sd(x22, x22_save);
 242     __ sd(x23, x23_save);
 243     __ sd(x24, x24_save);
 244     __ sd(x25, x25_save);
 245     __ sd(x26, x26_save);
 246     __ sd(x27, x27_save);
 247 
 248     // install Java thread in global register now we have saved
 249     // whatever value it held
 250     __ mv(xthread, c_rarg7);
 251 
 252     // And method
 253     __ mv(xmethod, c_rarg3);
 254 
 255     // set up the heapbase register
 256     __ reinit_heapbase();
 257 
 258 #ifdef ASSERT
 259     // make sure we have no pending exceptions
 260     {
 261       Label L;
 262       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 263       __ beqz(t0, L);
 264       __ stop("StubRoutines::call_stub: entered with pending exception");
 265       __ BIND(L);
 266     }
 267 #endif
 268     // pass parameters if any
 269     __ mv(esp, sp);
 270     __ slli(t0, c_rarg6, LogBytesPerWord);
 271     __ sub(t0, sp, t0); // Move SP out of the way
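         // round the new sp down to a 2-word boundary: the RISC-V calling
         // convention keeps the stack pointer 16-byte aligned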
 272     __ andi(sp, t0, -2 * wordSize);
 273 
 274     BLOCK_COMMENT("pass parameters if any");
 275     Label parameters_done;
 276     // parameter count is still in c_rarg6
 277     // and parameter pointer identifying param 1 is in c_rarg5
 278     __ beqz(c_rarg6, parameters_done);
 279 
 280     address loop = __ pc();
 281     __ ld(t0, c_rarg5, 0);
 282     __ addi(c_rarg5, c_rarg5, wordSize);
 283     __ addi(c_rarg6, c_rarg6, -1);
 284     __ push_reg(t0);
 285     __ bgtz(c_rarg6, loop);
 286 
 287     __ BIND(parameters_done);
 288 
 289     // call Java entry -- passing Method* and current sp
 290     //      xmethod: Method*
 291     //      x30: sender sp
 292     BLOCK_COMMENT("call Java function");
 293     __ mv(x30, sp);
 294     __ jalr(c_rarg4);
 295 
 296     // save current address for use by exception handling code
 297 
 298     return_address = __ pc();
 299 
 300     // store result depending on type (everything that is not
 301     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 302     // n.b. this assumes Java returns an integral result in x10
 303     // and a floating result in j_farg0
 304     __ ld(j_rarg2, result);
 305     Label is_long, is_float, is_double, exit;
 306     __ ld(j_rarg1, result_type);
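         // n.b. a T_OBJECT result also comes back as a full word in x10,
         // so it shares the T_LONG store path below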
 307     __ li(t0, (u1)T_OBJECT);
 308     __ beq(j_rarg1, t0, is_long);
 309     __ li(t0, (u1)T_LONG);
 310     __ beq(j_rarg1, t0, is_long);
 311     __ li(t0, (u1)T_FLOAT);
 312     __ beq(j_rarg1, t0, is_float);
 313     __ li(t0, (u1)T_DOUBLE);
 314     __ beq(j_rarg1, t0, is_double);
 315 
 316     // handle T_INT case
 317     __ sw(x10, Address(j_rarg2));
 318 
 319     __ BIND(exit);
 320 
 321     // pop parameters
 322     __ addi(esp, fp, sp_after_call_off * wordSize);
 323 
 324 #ifdef ASSERT
 325     // verify that threads correspond
 326     {
 327       Label L, S;
 328       __ ld(t0, thread);
 329       __ bne(xthread, t0, S);
 330       __ get_thread(t0);
 331       __ beq(xthread, t0, L);
 332       __ BIND(S);
 333       __ stop("StubRoutines::call_stub: threads must correspond");
 334       __ BIND(L);
 335     }
 336 #endif
 337 
 338     // restore callee-save registers
 339     __ ld(x27, x27_save);
 340     __ ld(x26, x26_save);
 341     __ ld(x25, x25_save);
 342     __ ld(x24, x24_save);
 343     __ ld(x23, x23_save);
 344     __ ld(x22, x22_save);
 345     __ ld(x21, x21_save);
 346     __ ld(x20, x20_save);
 347     __ ld(x19, x19_save);
 348     __ ld(x18, x18_save);
 349 
 350     __ ld(x9, x9_save);
 351 
 352     __ ld(c_rarg0, call_wrapper);
 353     __ ld(c_rarg1, result);
 354     __ ld(c_rarg2, result_type);
 355     __ ld(c_rarg3, method);
 356     __ ld(c_rarg4, entry_point);
 357     __ ld(c_rarg5, parameters);
 358     __ ld(c_rarg6, parameter_size);
 359     __ ld(c_rarg7, thread);
 360 
 361     // leave frame and return to caller
 362     __ leave();
 363     __ ret();
 364 
 365     // handle return types different from T_INT
 366 
 367     __ BIND(is_long);
 368     __ sd(x10, Address(j_rarg2, 0));
 369     __ j(exit);
 370 
 371     __ BIND(is_float);
 372     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 373     __ j(exit);
 374 
 375     __ BIND(is_double);
 376     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 377     __ j(exit);
 378 
 379     return start;
 380   }
 381 
 382   // Return point for a Java call if there's an exception thrown in
 383   // Java code.  The exception is caught and transformed into a
 384   // pending exception stored in JavaThread that can be tested from
 385   // within the VM.
 386   //
 387   // Note: Usually the parameters are removed by the callee. In case
 388   // of an exception crossing an activation frame boundary, that is
 389   // not the case if the callee is compiled code => need to setup the
 390   // sp.
 391   //
 392   // x10: exception oop
 393 
 394   address generate_catch_exception() {
 395     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 396     address start = __ pc();
 397 
 398     // same as in generate_call_stub():
 399     const Address thread(fp, thread_off * wordSize);
 400 
 401 #ifdef ASSERT
 402     // verify that threads correspond
 403     {
 404       Label L, S;
 405       __ ld(t0, thread);
 406       __ bne(xthread, t0, S);
 407       __ get_thread(t0);
 408       __ beq(xthread, t0, L);
 409       __ bind(S);
 410       __ stop("StubRoutines::catch_exception: threads must correspond");
 411       __ bind(L);
 412     }
 413 #endif
 414 
 415     // set pending exception
 416     __ verify_oop(x10);
 417 
 418     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
 419     __ mv(t0, (address)__FILE__);
 420     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 421     __ mv(t0, (int)__LINE__);
 422     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 423 
 424     // complete return to VM
 425     assert(StubRoutines::_call_stub_return_address != NULL,
 426            "_call_stub_return_address must have been generated before");
 427     __ j(StubRoutines::_call_stub_return_address);
 428 
 429     return start;
 430   }
 431 
 432   // Continuation point for runtime calls returning with a pending
 433   // exception.  The pending exception check happened in the runtime
 434   // or native call stub.  The pending exception in Thread is
 435   // converted into a Java-level exception.
 436   //
 437   // Contract with Java-level exception handlers:
 438   // x10: exception
 439   // x13: throwing pc
 440   //
 441   // NOTE: At entry of this stub, exception-pc must be in RA !!
 442 
 443   // NOTE: this is always used as a jump target within generated code
 444   // so it just needs to be generated code with no prolog
 445 
 446   address generate_forward_exception() {
 447     StubCodeMark mark(this, "StubRoutines", "forward exception");
 448     address start = __ pc();
 449 
 450     // Upon entry, RA points to the return address returning into
 451     // Java (interpreted or compiled) code; i.e., the return address
 452     // becomes the throwing pc.
 453     //
 454     // Arguments pushed before the runtime call are still on the stack
 455     // but the exception handler will reset the stack pointer ->
 456     // ignore them.  A potential result in registers can be ignored as
 457     // well.
 458 
 459 #ifdef ASSERT
 460     // make sure this code is only executed if there is a pending exception
 461     {
 462       Label L;
 463       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 464       __ bnez(t0, L);
 465       __ stop("StubRoutines::forward exception: no pending exception (1)");
 466       __ bind(L);
 467     }
 468 #endif
 469 
 470     // compute exception handler into x9
 471 
 472     // call the VM to find the handler address associated with the
 473     // caller address. pass thread in x10 and caller pc (ret address)
 474     // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
 475     // the stack.
 476     __ mv(c_rarg1, ra);
 477     // ra will be trashed by the VM call so we move it to x9
 478     // (callee-saved) because we also need to pass it to the handler
 479     // returned by this call.
 480     __ mv(x9, ra);
 481     BLOCK_COMMENT("call exception_handler_for_return_address");
 482     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 483                          SharedRuntime::exception_handler_for_return_address),
 484                     xthread, c_rarg1);
 485     // we should not really care that ra is no longer the callee
 486     // address. we saved the value the handler needs in x9 so we can
 487     // just copy it to x13. however, the C2 handler will push its own
 488     // frame and then calls into the VM and the VM code asserts that
 489     // the PC for the frame above the handler belongs to a compiled
 490     // Java method. So, we restore ra here to satisfy that assert.
 491     __ mv(ra, x9);
 492     // setup x10 & x13 & clear pending exception
 493     __ mv(x13, x9);
 494     __ mv(x9, x10);
 495     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 496     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 497 
 498 #ifdef ASSERT
 499     // make sure exception is set
 500     {
 501       Label L;
 502       __ bnez(x10, L);
 503       __ stop("StubRoutines::forward exception: no pending exception (2)");
 504       __ bind(L);
 505     }
 506 #endif
 507 
 508     // continue at exception handler
 509     // x10: exception
 510     // x13: throwing pc
 511     // x9: exception handler
 512     __ verify_oop(x10);
 513     __ jr(x9);
 514 
 515     return start;
 516   }
 517 
 518   // Non-destructive plausibility checks for oops
 519   //
 520   // Arguments:
 521   //    x10: oop to verify
 522   //    t0: error message
 523   //
 524   // Stack after saving c_rarg3:
 525   //    [tos + 0]: saved c_rarg3
 526   //    [tos + 1]: saved c_rarg2
 527   //    [tos + 2]: saved ra
 528   //    [tos + 3]: saved t1
 529   //    [tos + 4]: saved x10
 530   //    [tos + 5]: saved t0
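       // The check performed is roughly (illustrative sketch only):
       //
       //   if (obj == NULL) return;              // NULL oops are always ok
       //   if ((obj & Universe::verify_oop_mask()) != Universe::verify_oop_bits()) error;
       //   if (obj->klass() == NULL) error;      // broken oop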
 531   address generate_verify_oop() {
 532 
 533     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 534     address start = __ pc();
 535 
 536     Label exit, error;
 537 
 538     __ push_reg(0x3000, sp);   // save c_rarg2 and c_rarg3
 539 
 540     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 541     __ ld(c_rarg3, Address(c_rarg2));
 542     __ add(c_rarg3, c_rarg3, 1);
 543     __ sd(c_rarg3, Address(c_rarg2));
 544 
 545     // object is in x10
 546     // make sure object is 'reasonable'
 547     __ beqz(x10, exit); // if obj is NULL it is OK
 548 
 549 #if INCLUDE_ZGC
 550     if (UseZGC) {
 551       // Check if mask is good.
 552       // verifies that ZAddressBadMask & x10 == 0
 553       __ ld(c_rarg3, Address(xthread, ZThreadLocalData::address_bad_mask_offset()));
 554       __ andr(c_rarg2, x10, c_rarg3);
 555       __ bnez(c_rarg2, error);
 556     }
 557 #endif
 558 
 559     // Check if the oop is in the right area of memory
 560     __ mv(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 561     __ andr(c_rarg2, x10, c_rarg3);
 562     __ mv(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 563 
 564     // Compare c_rarg2 and c_rarg3.
 565     __ bne(c_rarg2, c_rarg3, error);
 566 
 567     // make sure klass is 'reasonable', i.e. non-zero.
 568     __ load_klass(x10, x10);  // get klass
 569     __ beqz(x10, error);      // if klass is NULL it is broken
 570 
 571     // return if everything seems ok
 572     __ bind(exit);
 573 
 574     __ pop_reg(0x3000, sp);   // pop c_rarg2 and c_rarg3
 575     __ ret();
 576 
 577     // handle errors
 578     __ bind(error);
 579     __ pop_reg(0x3000, sp);   // pop c_rarg2 and c_rarg3
 580 
 581     __ pusha();
 582     // debug(char* msg, int64_t pc, int64_t regs[])
 583     __ mv(c_rarg0, t0);             // pass address of error message
 584     __ mv(c_rarg1, ra);             // pass return address
 585     __ mv(c_rarg2, sp);             // pass address of regs on stack
 586 #ifndef PRODUCT
 587     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 588 #endif
 589     BLOCK_COMMENT("call MacroAssembler::debug");
 590     int32_t offset = 0;
 591     __ movptr_with_offset(t0, CAST_FROM_FN_PTR(address, MacroAssembler::debug64), offset);
 592     __ jalr(x1, t0, offset);
 593     __ ebreak();
 594 
 595     return start;
 596   }
 597 
 598   // The inner part of zero_words().
 599   //
 600   // Inputs:
 601   // x28: the HeapWord-aligned base address of an array to zero.
 602   // x29: the count in HeapWords, x29 > 0.
 603   //
 604   // Returns x28 and x29, adjusted for the caller to clear.
 605   // x28: the base address of the tail of words left to clear.
 606   // x29: the number of words in the tail.
 607   //      x29 < MacroAssembler::zero_words_block_size.
 608 
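       // A rough C-level sketch of the loop generated below, where block_size
       // is MacroAssembler::zero_words_block_size (illustrative only):
       //
       //   while (cnt >= block_size) {
       //     for (int i = 0; i < block_size; i++) { *base++ = 0; }
       //     cnt -= block_size;
       //   }
       //   // base and cnt now describe the remaining tail (cnt < block_size
       //   // words), which is left for the caller to clear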
 609   address generate_zero_blocks() {
 610     Label done;
 611 
 612     const Register base = x28, cnt = x29;
 613 
 614     __ align(CodeEntryAlignment);
 615     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 616     address start = __ pc();
 617 
 618     {
 619       // Clear the remaining blocks.
 620       Label loop;
 621       __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
 622       __ bltz(cnt, done);
 623       __ bind(loop);
 624       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 625         __ sd(zr, Address(base, 0));
 626         __ add(base, base, 8);
 627       }
 628       __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
 629       __ bgez(cnt, loop);
 630       __ bind(done);
 631       __ add(cnt, cnt, MacroAssembler::zero_words_block_size);
 632     }
 633 
 634     __ ret();
 635 
 636     return start;
 637   }
 638 
 639   typedef enum {
 640     copy_forwards = 1,
 641     copy_backwards = -1
 642   } copy_direction;
 643 
 644   // Bulk copy of blocks of 8 words.
 645   //
 646   // count is a count of words.
 647   //
 648   // Precondition: count >= 8
 649   //
 650   // Postconditions:
 651   //
 652   // The least significant bit of count contains the remaining count
 653   // of words to copy.  The rest of count is trash.
 654   //
 655   // s and d are adjusted to point to the remaining words to copy
 656   //
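       // Roughly, for the forwards case (sketch only; the backwards case
       // mirrors it with negative strides):
       //
       //   load 8 words; count -= 16;
       //   while (count >= 0) { store the 8 words just loaded; load the next 8; count -= 8; }
       //   store the final 8 words;     // drain
       //   copy an optional 4-word and 2-word tail using the low bits of count
       //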
 657   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 658                            copy_direction direction) {
 659     int unit = wordSize * direction;
 660     int bias = wordSize;
 661 
 662     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 663       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 664 
 665     const Register stride = x30;
 666 
 667     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 668       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 669     assert_different_registers(s, d, count, t0);
 670 
 671     Label again, drain;
 672     const char* stub_name = NULL;
 673     if (direction == copy_forwards) {
 674       stub_name = "forward_copy_longs";
 675     } else {
 676       stub_name = "backward_copy_longs";
 677     }
 678     StubCodeMark mark(this, "StubRoutines", stub_name);
 679     __ align(CodeEntryAlignment);
 680     __ bind(start);
 681 
 682     if (direction == copy_forwards) {
 683       __ sub(s, s, bias);
 684       __ sub(d, d, bias);
 685     }
 686 
 687 #ifdef ASSERT
 688     // Make sure we are never given < 8 words
 689     {
 690       Label L;
 691 
 692       __ li(t0, 8);
 693       __ bge(count, t0, L);
 694       __ stop("generate_copy_longs called with < 8 words");
 695       __ bind(L);
 696     }
 697 #endif
 698 
 699     __ ld(tmp_reg0, Address(s, 1 * unit));
 700     __ ld(tmp_reg1, Address(s, 2 * unit));
 701     __ ld(tmp_reg2, Address(s, 3 * unit));
 702     __ ld(tmp_reg3, Address(s, 4 * unit));
 703     __ ld(tmp_reg4, Address(s, 5 * unit));
 704     __ ld(tmp_reg5, Address(s, 6 * unit));
 705     __ ld(tmp_reg6, Address(s, 7 * unit));
 706     __ ld(tmp_reg7, Address(s, 8 * unit));
 707     __ addi(s, s, 8 * unit);
 708 
 709     __ sub(count, count, 16);
 710     __ bltz(count, drain);
 711 
 712     __ bind(again);
 713 
 714     __ sd(tmp_reg0, Address(d, 1 * unit));
 715     __ sd(tmp_reg1, Address(d, 2 * unit));
 716     __ sd(tmp_reg2, Address(d, 3 * unit));
 717     __ sd(tmp_reg3, Address(d, 4 * unit));
 718     __ sd(tmp_reg4, Address(d, 5 * unit));
 719     __ sd(tmp_reg5, Address(d, 6 * unit));
 720     __ sd(tmp_reg6, Address(d, 7 * unit));
 721     __ sd(tmp_reg7, Address(d, 8 * unit));
 722 
 723     __ ld(tmp_reg0, Address(s, 1 * unit));
 724     __ ld(tmp_reg1, Address(s, 2 * unit));
 725     __ ld(tmp_reg2, Address(s, 3 * unit));
 726     __ ld(tmp_reg3, Address(s, 4 * unit));
 727     __ ld(tmp_reg4, Address(s, 5 * unit));
 728     __ ld(tmp_reg5, Address(s, 6 * unit));
 729     __ ld(tmp_reg6, Address(s, 7 * unit));
 730     __ ld(tmp_reg7, Address(s, 8 * unit));
 731 
 732     __ addi(s, s, 8 * unit);
 733     __ addi(d, d, 8 * unit);
 734 
 735     __ sub(count, count, 8);
 736     __ bgez(count, again);
 737 
 738     // Drain
 739     __ bind(drain);
 740 
 741     __ sd(tmp_reg0, Address(d, 1 * unit));
 742     __ sd(tmp_reg1, Address(d, 2 * unit));
 743     __ sd(tmp_reg2, Address(d, 3 * unit));
 744     __ sd(tmp_reg3, Address(d, 4 * unit));
 745     __ sd(tmp_reg4, Address(d, 5 * unit));
 746     __ sd(tmp_reg5, Address(d, 6 * unit));
 747     __ sd(tmp_reg6, Address(d, 7 * unit));
 748     __ sd(tmp_reg7, Address(d, 8 * unit));
 749     __ addi(d, d, 8 * unit);
 750 
 751     {
 752       Label L1, L2;
 753       __ andi(t0, count, 4);
 754       __ beqz(t0, L1);
 755 
 756       __ ld(tmp_reg0, Address(s, 1 * unit));
 757       __ ld(tmp_reg1, Address(s, 2 * unit));
 758       __ ld(tmp_reg2, Address(s, 3 * unit));
 759       __ ld(tmp_reg3, Address(s, 4 * unit));
 760       __ addi(s, s, 4 * unit);
 761 
 762       __ sd(tmp_reg0, Address(d, 1 * unit));
 763       __ sd(tmp_reg1, Address(d, 2 * unit));
 764       __ sd(tmp_reg2, Address(d, 3 * unit));
 765       __ sd(tmp_reg3, Address(d, 4 * unit));
 766       __ addi(d, d, 4 * unit);
 767 
 768       __ bind(L1);
 769 
 770       if (direction == copy_forwards) {
 771         __ addi(s, s, bias);
 772         __ addi(d, d, bias);
 773       }
 774 
 775       __ andi(t0, count, 2);
 776       __ beqz(t0, L2);
 777       if (direction == copy_backwards) {
 778         __ addi(s, s, 2 * unit);
 779         __ ld(tmp_reg0, Address(s));
 780         __ ld(tmp_reg1, Address(s, wordSize));
 781         __ addi(d, d, 2 * unit);
 782         __ sd(tmp_reg0, Address(d));
 783         __ sd(tmp_reg1, Address(d, wordSize));
 784       } else {
 785         __ ld(tmp_reg0, Address(s));
 786         __ ld(tmp_reg1, Address(s, wordSize));
 787         __ addi(s, s, 2 * unit);
 788         __ sd(tmp_reg0, Address(d));
 789         __ sd(tmp_reg1, Address(d, wordSize));
 790         __ addi(d, d, 2 * unit);
 791       }
 792       __ bind(L2);
 793     }
 794 
 795     __ ret();
 796   }
 797 
 798   Label copy_f, copy_b;
 799 
 800   // All-singing all-dancing memory copy.
 801   //
 802   // Copy count units of memory from s to d.  The size of a unit is
 803   // step, which can be positive or negative depending on the direction
 804   // of copy.  If is_aligned is false, we align the source address.
 805   //
 806   /*
 807    * if (is_aligned) {
 808    *   goto copy_8_bytes;
 809    * }
 810    * bool is_backwards = step < 0;
 811    * int granularity = uabs(step);
 812    * count = count  *  granularity;   * count bytes
 813    *
 814    * if (is_backwards) {
 815    *   s += count;
 816    *   d += count;
 817    * }
 818    *
 819    * the count limit may be greater than 16, for better performance
 820    * if (count < 16) {
 821    *   goto copy_small;
 822    * }
 823    *
 824    * if ((dst % 8) == (src % 8)) {
 825    *   aligned;
 826    *   goto copy8;
 827    * }
 828    *
 829    * copy_small:
 830    *   load element one by one;
 831    * done;
 832    */
 833 
 834   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
 835 
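       // Vector (RVV) variant of copy_memory: each pass lets vsetvli choose
       // how many elements fit in one m8 register group and then performs a
       // unit-stride load/store of that many elements.  Roughly (sketch,
       // forwards case):
       //
       //   while (cnt > 0) {
       //     vl = vsetvli(cnt, sew, m8);       // elements handled this pass
       //     copy vl elements from src to dst;
       //     src += vl * element_size; dst += vl * element_size; cnt -= vl;
       //   }
       //
       // The backwards case instead copies each chunk starting at offset
       // (cnt - vl) * element_size so that the tail is moved first.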
 836   void copy_memory_v(Register s, Register d, Register count, Register tmp, int step) {
 837     bool is_backward = step < 0;
 838     int granularity = uabs(step);
 839 
 840     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 841     assert_different_registers(s, d, cnt, vl, tmp, tmp1, tmp2);
 842     Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
 843     Label loop_forward, loop_backward, done;
 844 
 845     __ mv(dst, d);
 846     __ mv(src, s);
 847     __ mv(cnt, count);
 848 
 849     __ bind(loop_forward);
 850     __ vsetvli(vl, cnt, sew, Assembler::m8);
 851     if (is_backward) {
 852       __ bne(vl, cnt, loop_backward);
 853     }
 854 
 855     __ vlex_v(v0, src, sew);
 856     __ sub(cnt, cnt, vl);
 857     __ slli(vl, vl, (int)sew);
 858     __ add(src, src, vl);
 859 
 860     __ vsex_v(v0, dst, sew);
 861     __ add(dst, dst, vl);
 862     __ bnez(cnt, loop_forward);
 863 
 864     if (is_backward) {
 865       __ j(done);
 866 
 867       __ bind(loop_backward);
 868       __ sub(tmp, cnt, vl);
 869       __ slli(tmp, tmp, sew);
 870       __ add(tmp1, s, tmp);
 871       __ vlex_v(v0, tmp1, sew);
 872       __ add(tmp2, d, tmp);
 873       __ vsex_v(v0, tmp2, sew);
 874       __ sub(cnt, cnt, vl);
 875       __ bnez(cnt, loop_forward);
 876       __ bind(done);
 877     }
 878   }
 879 
 880   void copy_memory(bool is_aligned, Register s, Register d,
 881                    Register count, Register tmp, int step) {
 882     if (UseRVV) {
 883       return copy_memory_v(s, d, count, tmp, step);
 884     }
 885 
 886     bool is_backwards = step < 0;
 887     int granularity = uabs(step);
 888 
 889     const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17;
 890 
 891     Label same_aligned;
 892     Label copy8, copy_small, done;
 893 
 894     copy_insn ld_arr = NULL, st_arr = NULL;
 895     switch (granularity) {
 896       case 1 :
 897         ld_arr = (copy_insn)&MacroAssembler::lbu;
 898         st_arr = (copy_insn)&MacroAssembler::sb;
 899         break;
 900       case 2 :
 901         ld_arr = (copy_insn)&MacroAssembler::lhu;
 902         st_arr = (copy_insn)&MacroAssembler::sh;
 903         break;
 904       case 4 :
 905         ld_arr = (copy_insn)&MacroAssembler::lwu;
 906         st_arr = (copy_insn)&MacroAssembler::sw;
 907         break;
 908       case 8 :
 909         ld_arr = (copy_insn)&MacroAssembler::ld;
 910         st_arr = (copy_insn)&MacroAssembler::sd;
 911         break;
 912       default :
 913         ShouldNotReachHere();
 914     }
 915 
 916     __ beqz(count, done);
 917     __ slli(cnt, count, exact_log2(granularity));
 918     if (is_backwards) {
 919       __ add(src, s, cnt);
 920       __ add(dst, d, cnt);
 921     } else {
 922       __ mv(src, s);
 923       __ mv(dst, d);
 924     }
 925 
 926     if (is_aligned) {
 927       __ addi(tmp, cnt, -8);
 928       __ bgez(tmp, copy8);
 929       __ j(copy_small);
 930     }
 931 
 932     __ mv(tmp, 16);
 933     __ blt(cnt, tmp, copy_small);
 934 
 935     __ xorr(tmp, src, dst);
 936     __ andi(tmp, tmp, 0b111);
 937     __ bnez(tmp, copy_small);
 938 
 939     __ bind(same_aligned);
 940     __ andi(tmp, src, 0b111);
 941     __ beqz(tmp, copy8);
 942     if (is_backwards) {
 943       __ addi(src, src, step);
 944       __ addi(dst, dst, step);
 945     }
 946     (_masm->*ld_arr)(tmp3, Address(src), t0);
 947     (_masm->*st_arr)(tmp3, Address(dst), t0);
 948     if (!is_backwards) {
 949       __ addi(src, src, step);
 950       __ addi(dst, dst, step);
 951     }
 952     __ addi(cnt, cnt, -granularity);
 953     __ beqz(cnt, done);
 954     __ j(same_aligned);
 955 
 956     __ bind(copy8);
 957     if (is_backwards) {
 958       __ addi(src, src, -wordSize);
 959       __ addi(dst, dst, -wordSize);
 960     }
 961     __ ld(tmp3, Address(src));
 962     __ sd(tmp3, Address(dst));
 963     if (!is_backwards) {
 964       __ addi(src, src, wordSize);
 965       __ addi(dst, dst, wordSize);
 966     }
 967     __ addi(cnt, cnt, -wordSize);
 968     __ addi(tmp4, cnt, -8);
 969     __ bgez(tmp4, copy8); // cnt >= 8, do next loop
 970 
 971     __ beqz(cnt, done);
 972 
 973     __ bind(copy_small);
 974     if (is_backwards) {
 975       __ addi(src, src, step);
 976       __ addi(dst, dst, step);
 977     }
 978     (_masm->*ld_arr)(tmp3, Address(src), t0);
 979     (_masm->*st_arr)(tmp3, Address(dst), t0);
 980     if (!is_backwards) {
 981       __ addi(src, src, step);
 982       __ addi(dst, dst, step);
 983     }
 984     __ addi(cnt, cnt, -granularity);
 985     __ bgtz(cnt, copy_small);
 986 
 987     __ bind(done);
 988   }
 989 
 990   // Scan over array at a for count oops, verifying each one.
 991   // Preserves a and count, clobbers t0 and t1.
 992   void verify_oop_array(size_t size, Register a, Register count, Register temp) {
 993     Label loop, end;
 994     __ mv(t1, zr);
 995     __ slli(t0, count, exact_log2(size));
 996     __ bind(loop);
 997     __ bgeu(t1, t0, end);
 998 
 999     __ add(temp, a, t1);
1000     if (size == (size_t)wordSize) {
1001       __ ld(temp, Address(temp, 0));
1002       __ verify_oop(temp);
1003     } else {
1004       __ lwu(temp, Address(temp, 0));
1005       __ decode_heap_oop(temp); // calls verify_oop
1006     }
1007     __ add(t1, t1, size);
1008     __ j(loop);
1009     __ bind(end);
1010   }
1011 
1012   // Arguments:
1013   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1014   //             ignored
1015   //   is_oop  - true => oop array, so generate store check code
1016   //   name    - stub name string
1017   //
1018   // Inputs:
1019   //   c_rarg0   - source array address
1020   //   c_rarg1   - destination array address
1021   //   c_rarg2   - element count, treated as ssize_t, can be zero
1022   //
1023   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1024   // the hardware handle it.  The two dwords within qwords that span
1025   // cache line boundaries will still be loaded and stored atomically.
1026   //
1027   // Side Effects:
1028   //   disjoint_int_copy_entry is set to the no-overlap entry point
1029   //   used by generate_conjoint_int_oop_copy().
1030   //
1031   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry,
1032                                  const char* name, bool dest_uninitialized = false) {
1033     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1034     RegSet saved_reg = RegSet::of(s, d, count);
1035     __ align(CodeEntryAlignment);
1036     StubCodeMark mark(this, "StubRoutines", name);
1037     address start = __ pc();
1038     __ enter();
1039 
1040     if (entry != NULL) {
1041       *entry = __ pc();
1042       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1043       BLOCK_COMMENT("Entry:");
1044     }
1045 
1046     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1047     if (dest_uninitialized) {
1048       decorators |= IS_DEST_UNINITIALIZED;
1049     }
1050     if (aligned) {
1051       decorators |= ARRAYCOPY_ALIGNED;
1052     }
1053 
1054     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1055     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1056 
1057     if (is_oop) {
1058       // save regs before copy_memory
1059       __ push_reg(RegSet::of(d, count), sp);
1060     }
1061 
1062     {
1063       // UnsafeCopyMemory page error: continue after ucm
1064       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1065       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1066       copy_memory(aligned, s, d, count, t0, size);
1067     }
1068 
1069     if (is_oop) {
1070       __ pop_reg(RegSet::of(d, count), sp);
1071       if (VerifyOops) {
1072         verify_oop_array(size, d, count, t2);
1073       }
1074     }
1075 
1076     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1077 
1078     __ leave();
1079     __ mv(x10, zr); // return 0
1080     __ ret();
1081     return start;
1082   }
1083 
1084   // Arguments:
1085   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1086   //             ignored
1087   //   is_oop  - true => oop array, so generate store check code
1088   //   name    - stub name string
1089   //
1090   // Inputs:
1091   //   c_rarg0   - source array address
1092   //   c_rarg1   - destination array address
1093   //   c_rarg2   - element count, treated as ssize_t, can be zero
1094   //
1095   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1096   // the hardware handle it.  The two dwords within qwords that span
1097   // cache line boundaries will still be loaded and stored atomically.
1098   //
1099   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1100                                  address* entry, const char* name,
1101                                  bool dest_uninitialized = false) {
1102     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1103     RegSet saved_regs = RegSet::of(s, d, count);
1104     StubCodeMark mark(this, "StubRoutines", name);
1105     address start = __ pc();
1106     __ enter();
1107 
1108     if (entry != NULL) {
1109       *entry = __ pc();
1110       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1111       BLOCK_COMMENT("Entry:");
1112     }
1113 
1114     // use fwd copy when (d-s) above_equal (count*size)
1115     __ sub(t0, d, s);
1116     __ slli(t1, count, exact_log2(size));
1117     __ bgeu(t0, t1, nooverlap_target);
1118 
1119     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1120     if (dest_uninitialized) {
1121       decorators |= IS_DEST_UNINITIALIZED;
1122     }
1123     if (aligned) {
1124       decorators |= ARRAYCOPY_ALIGNED;
1125     }
1126 
1127     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1128     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1129 
1130     if (is_oop) {
1131       // save regs before copy_memory
1132       __ push_reg(RegSet::of(d, count), sp);
1133     }
1134 
1135     {
1136       // UnsafeCopyMemory page error: continue after ucm
1137       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1138       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1139       copy_memory(aligned, s, d, count, t0, -size);
1140     }
1141 
1142     if (is_oop) {
1143       __ pop_reg(RegSet::of(d, count), sp);
1144       if (VerifyOops) {
1145         verify_oop_array(size, d, count, t2);
1146       }
1147     }
1148     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1149     __ leave();
1150     __ mv(x10, zr); // return 0
1151     __ ret();
1152     return start;
1153   }
1154 
1155   // Arguments:
1156   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1157   //             ignored
1158   //   name    - stub name string
1159   //
1160   // Inputs:
1161   //   c_rarg0   - source array address
1162   //   c_rarg1   - destination array address
1163   //   c_rarg2   - element count, treated as ssize_t, can be zero
1164   //
1165   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1166   // we let the hardware handle it.  The one to eight bytes within words,
1167   // dwords or qwords that span cache line boundaries will still be loaded
1168   // and stored atomically.
1169   //
1177   // Side Effects:
1178   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1179   //   used by generate_conjoint_byte_copy().
1180   //
1181   address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) {
1182     const bool not_oop = false;
1183     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1184   }
1185 
1186   // Arguments:
1187   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1188   //             ignored
1189   //   name    - stub name string
1190   //
1191   // Inputs:
1192   //   c_rarg0   - source array address
1193   //   c_rarg1   - destination array address
1194   //   c_rarg2   - element count, treated as ssize_t, can be zero
1195   //
1196   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1197   // we let the hardware handle it.  The one to eight bytes within words,
1198   // dwords or qwords that span cache line boundaries will still be loaded
1199   // and stored atomically.
1200   //
1201   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1202                                       address* entry, const char* name) {
1203     const bool not_oop = false;
1204     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1205   }
1206 
1207   // Arguments:
1208   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1209   //             ignored
1210   //   name    - stub name string
1211   //
1212   // Inputs:
1213   //   c_rarg0   - source array address
1214   //   c_rarg1   - destination array address
1215   //   c_rarg2   - element count, treated as ssize_t, can be zero
1216   //
1217   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1218   // let the hardware handle it.  The two or four words within dwords
1219   // or qwords that span cache line boundaries will still be loaded
1220   // and stored atomically.
1221   //
1222   // Side Effects:
1223   //   disjoint_short_copy_entry is set to the no-overlap entry point
1224   //   used by generate_conjoint_short_copy().
1225   //
1226   address generate_disjoint_short_copy(bool aligned,
1227                                        address* entry, const char* name) {
1228     const bool not_oop = false;
1229     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1230   }
1231 
1232   // Arguments:
1233   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1234   //             ignored
1235   //   name    - stub name string
1236   //
1237   // Inputs:
1238   //   c_rarg0   - source array address
1239   //   c_rarg1   - destination array address
1240   //   c_rarg2   - element count, treated as ssize_t, can be zero
1241   //
1242   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1243   // let the hardware handle it.  The two or four words within dwords
1244   // or qwords that span cache line boundaries will still be loaded
1245   // and stored atomically.
1246   //
1247   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1248                                        address* entry, const char* name) {
1249     const bool not_oop = false;
1250     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1251   }
1252 
1253   // Arguments:
1254   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1255   //             ignored
1256   //   name    - stub name string
1257   //
1258   // Inputs:
1259   //   c_rarg0   - source array address
1260   //   c_rarg1   - destination array address
1261   //   c_rarg2   - element count, treated as ssize_t, can be zero
1262   //
1263   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1264   // the hardware handle it.  The two dwords within qwords that span
1265   // cache line boundaries will still be loaded and stored atomically.
1266   //
1267   // Side Effects:
1268   //   disjoint_int_copy_entry is set to the no-overlap entry point
1269   //   used by generate_conjoint_int_oop_copy().
1270   //
1271   address generate_disjoint_int_copy(bool aligned, address* entry,
1272                                      const char* name, bool dest_uninitialized = false) {
1273     const bool not_oop = false;
1274     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1275   }
1276 
1277   // Arguments:
1278   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1279   //             ignored
1280   //   name    - stub name string
1281   //
1282   // Inputs:
1283   //   c_rarg0   - source array address
1284   //   c_rarg1   - destination array address
1285   //   c_rarg2   - element count, treated as ssize_t, can be zero
1286   //
1287   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1288   // the hardware handle it.  The two dwords within qwords that span
1289   // cache line boundaries will still be loaded and stored atomically.
1290   //
1291   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1292                                      address* entry, const char* name,
1293                                      bool dest_uninitialized = false) {
1294     const bool not_oop = false;
1295     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1296   }
1297 
1298 
1299   // Arguments:
1300   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1301   //             ignored
1302   //   name    - stub name string
1303   //
1304   // Inputs:
1305   //   c_rarg0   - source array address
1306   //   c_rarg1   - destination array address
1307   //   c_rarg2   - element count, treated as size_t, can be zero
1308   //
1309   // Side Effects:
1310   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1311   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1312   //
1313   address generate_disjoint_long_copy(bool aligned, address* entry,
1314                                       const char* name, bool dest_uninitialized = false) {
1315     const bool not_oop = false;
1316     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1317   }
1318 
1319   // Arguments:
1320   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1321   //             ignored
1322   //   name    - stub name string
1323   //
1324   // Inputs:
1325   //   c_rarg0   - source array address
1326   //   c_rarg1   - destination array address
1327   //   c_rarg2   - element count, treated as size_t, can be zero
1328   //
1329   address generate_conjoint_long_copy(bool aligned,
1330                                       address nooverlap_target, address* entry,
1331                                       const char* name, bool dest_uninitialized = false) {
1332     const bool not_oop = false;
1333     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1334   }
1335 
1336   // Arguments:
1337   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1338   //             ignored
1339   //   name    - stub name string
1340   //
1341   // Inputs:
1342   //   c_rarg0   - source array address
1343   //   c_rarg1   - destination array address
1344   //   c_rarg2   - element count, treated as size_t, can be zero
1345   //
1346   // Side Effects:
1347   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1348   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1349   //
1350   address generate_disjoint_oop_copy(bool aligned, address* entry,
1351                                      const char* name, bool dest_uninitialized) {
1352     const bool is_oop = true;
1353     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1354     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1355   }
1356 
1357   // Arguments:
1358   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1359   //             ignored
1360   //   name    - stub name string
1361   //
1362   // Inputs:
1363   //   c_rarg0   - source array address
1364   //   c_rarg1   - destination array address
1365   //   c_rarg2   - element count, treated as size_t, can be zero
1366   //
1367   address generate_conjoint_oop_copy(bool aligned,
1368                                      address nooverlap_target, address* entry,
1369                                      const char* name, bool dest_uninitialized) {
1370     const bool is_oop = true;
1371     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1372     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1373                                   name, dest_uninitialized);
1374   }
1375 
1376   // Helper for generating a dynamic type check.
1377   // Smashes t0, t1.
1378   void generate_type_check(Register sub_klass,
1379                            Register super_check_offset,
1380                            Register super_klass,
1381                            Label& L_success) {
1382     assert_different_registers(sub_klass, super_check_offset, super_klass);
1383 
1384     BLOCK_COMMENT("type_check:");
1385 
1386     Label L_miss;
1387 
1388     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, super_check_offset);
1389     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1390 
1391     // Fall through on failure!
1392     __ BIND(L_miss);
1393   }
1394 
1395   //
1396   //  Generate checkcasting array copy stub
1397   //
1398   //  Input:
1399   //    c_rarg0   - source array address
1400   //    c_rarg1   - destination array address
1401   //    c_rarg2   - element count, treated as ssize_t, can be zero
1402   //    c_rarg3   - size_t ckoff (super_check_offset)
1403   //    c_rarg4   - oop ckval (super_klass)
1404   //
1405   //  Output:
1406   //    x10 ==  0  -  success
1407   //    x10 == -1^K - failure, where K is partial transfer count
1408   //
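       //  i.e. on failure the caller can recover the number K of elements
       //  already copied as K = ~x10 (bitwise NOT of the return value)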
1409   address generate_checkcast_copy(const char* name, address* entry,
1410                                   bool dest_uninitialized = false) {
1411     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1412 
1413     // Input registers (after setup_arg_regs)
1414     const Register from        = c_rarg0;   // source array address
1415     const Register to          = c_rarg1;   // destination array address
1416     const Register count       = c_rarg2;   // elements count
1417     const Register ckoff       = c_rarg3;   // super_check_offset
1418     const Register ckval       = c_rarg4;   // super_klass
1419 
1420     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1421     RegSet wb_post_saved_regs  = RegSet::of(count);
1422 
1423     // Registers used as temps (x7, x9, x18 are save-on-entry)
1424     const Register count_save  = x19;       // orig elements count
1425     const Register start_to    = x18;       // destination array start address
1426     const Register copied_oop  = x7;        // actual oop copied
1427     const Register r9_klass    = x9;        // oop._klass
1428 
1429     //---------------------------------------------------------------
1430     // Assembler stub will be used for this call to arraycopy
1431     // if the two arrays are subtypes of Object[] but the
1432     // destination array type is not equal to or a supertype
1433     // of the source type.  Each element must be separately
1434     // checked.
1435 
1436     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1437                                copied_oop, r9_klass, count_save);
1438 
1439     __ align(CodeEntryAlignment);
1440     StubCodeMark mark(this, "StubRoutines", name);
1441     address start = __ pc();
1442 
1443     __ enter(); // required for proper stackwalking of RuntimeStub frame
1444 
1445     // Caller of this entry point must set up the argument registers.
1446     if (entry != NULL) {
1447       *entry = __ pc();
1448       BLOCK_COMMENT("Entry:");
1449     }
1450 
1451     // Empty array:  Nothing to do
1452     __ beqz(count, L_done);
1453 
1454     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1455 
1456 #ifdef ASSERT
1457     BLOCK_COMMENT("assert consistent ckoff/ckval");
1458     // The ckoff and ckval must be mutually consistent,
1459     // even though caller generates both.
1460     { Label L;
1461       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1462       __ lwu(start_to, Address(ckval, sco_offset));
1463       __ beq(ckoff, start_to, L);
1464       __ stop("super_check_offset inconsistent");
1465       __ bind(L);
1466     }
1467 #endif //ASSERT
1468 
1469     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1470     bool is_oop = true;
1471     if (dest_uninitialized) {
1472       decorators |= IS_DEST_UNINITIALIZED;
1473     }
1474 
1475     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1476     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1477 
1478     // save the original count
1479     __ mv(count_save, count);
1480 
1481     // Copy from low to high addresses
1482     __ mv(start_to, to);              // Save destination array start address
1483     __ j(L_load_element);
1484 
1485     // ======== begin loop ========
1486     // (Loop is rotated; its entry is L_load_element.)
1487     // Loop control:
1488     //   for count to 0 do
1489     //     copied_oop = load_heap_oop(from++)
1490     //     ... generate_type_check ...
1491     //     store_heap_oop(to++, copied_oop)
1492     //   end
1493 
1494     __ align(OptoLoopAlignment);
1495 
1496     __ BIND(L_store_element);
1497     __ store_heap_oop(Address(to, 0), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1498     __ add(to, to, UseCompressedOops ? 4 : 8);
1499     __ sub(count, count, 1);
1500     __ beqz(count, L_do_card_marks);
1501 
1502     // ======== loop entry is here ========
1503     __ BIND(L_load_element);
1504     __ load_heap_oop(copied_oop, Address(from, 0), noreg, noreg, AS_RAW); // load the oop
1505     __ add(from, from, UseCompressedOops ? 4 : 8);
1506     __ beqz(copied_oop, L_store_element);
1507 
1508     __ load_klass(r9_klass, copied_oop);// query the object klass
1509     generate_type_check(r9_klass, ckoff, ckval, L_store_element);
1510     // ======== end loop ========
1511 
1512     // It was a real error; we must depend on the caller to finish the job.
1513     // Register count = remaining oops, count_orig = total oops.
1514     // Emit GC store barriers for the oops we have copied and report
1515     // their number to the caller.
1516 
1517     __ sub(count, count_save, count);     // K = partially copied oop count
1518     __ xori(count, count, -1);                   // report (-1^K) to caller
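         // The value reported to the caller is ~K (== -1 ^ K), where
         // K == count_save - count is the number of oops successfully copied
         // before the failing element. For example, if 2 oops were copied before
         // the type check failed, the caller receives -3 and recovers the
         // partial transfer count as ~(-3) == 2.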
1519     __ beqz(count, L_done_pop);
1520 
1521     __ BIND(L_do_card_marks);
1522     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);
1523 
1524     __ bind(L_done_pop);
1525     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1526     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1527 
1528     __ bind(L_done);
1529     __ mv(x10, count);
1530     __ leave();
1531     __ ret();
1532 
1533     return start;
1534   }
1535 
1536   // Perform range checks on the proposed arraycopy.
1537   // Kills temp, but nothing else.
1538   // Also, clean the sign bits of src_pos and dst_pos.
1539   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1540                               Register src_pos, // source position (c_rarg1)
1541                               Register dst,     // destination array oop (c_rarg2)
1542                               Register dst_pos, // destination position (c_rarg3)
1543                               Register length,
1544                               Register temp,
1545                               Label& L_failed) {
1546     BLOCK_COMMENT("arraycopy_range_checks:");
1547 
1548     assert_different_registers(t0, temp);
1549 
1550     // if [src_pos + length > arrayOop(src)->length()] then FAIL
1551     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1552     __ addw(temp, length, src_pos);
1553     __ bgtu(temp, t0, L_failed);
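         // For example, with src->length() == 10, src_pos == 7 and length == 5,
         // temp == 12 > 10, so the copy would run past the end of src and we
         // branch to L_failed.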
1554 
1555     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1556     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1557     __ addw(temp, length, dst_pos);
1558     __ bgtu(temp, t0, L_failed);
1559 
1560     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1561     __ zero_extend(src_pos, src_pos, 32);
1562     __ zero_extend(dst_pos, dst_pos, 32);
1563 
1564     BLOCK_COMMENT("arraycopy_range_checks done");
1565   }
1566 
1567   //
1568   //  Generate 'unsafe' array copy stub
1569   //  Though just as safe as the other stubs, it takes an unscaled
1570   //  size_t argument instead of an element count.
1571   //
1572   //  Input:
1573   //    c_rarg0   - source array address
1574   //    c_rarg1   - destination array address
1575   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1576   //
1577   // Examines the alignment of the operands and dispatches
1578   // to a long, int, short, or byte copy loop.
1579   //
1580   address generate_unsafe_copy(const char* name,
1581                                address byte_copy_entry,
1582                                address short_copy_entry,
1583                                address int_copy_entry,
1584                                address long_copy_entry) {
1585     assert_cond(byte_copy_entry != NULL && short_copy_entry != NULL &&
1586                 int_copy_entry != NULL && long_copy_entry != NULL);
1587     Label L_long_aligned, L_int_aligned, L_short_aligned;
1588     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1589 
1590     __ align(CodeEntryAlignment);
1591     StubCodeMark mark(this, "StubRoutines", name);
1592     address start = __ pc();
1593     __ enter(); // required for proper stackwalking of RuntimeStub frame
1594 
1595     // bump this on entry, not on exit:
1596     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1597 
1598     __ orr(t0, s, d);
1599     __ orr(t0, t0, count);
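         // t0 now holds (src | dst | byte count); its low bits are clear only if
         // all three operands share the corresponding alignment, which lets us
         // pick the widest element size that is safe for every operand below.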
1600 
1601     __ andi(t0, t0, BytesPerLong - 1);
1602     __ beqz(t0, L_long_aligned);
1603     __ andi(t0, t0, BytesPerInt - 1);
1604     __ beqz(t0, L_int_aligned);
1605     __ andi(t0, t0, 1);
1606     __ beqz(t0, L_short_aligned);
1607     __ j(RuntimeAddress(byte_copy_entry));
1608 
1609     __ BIND(L_short_aligned);
1610     __ srli(count, count, LogBytesPerShort);  // size => short_count
1611     __ j(RuntimeAddress(short_copy_entry));
1612     __ BIND(L_int_aligned);
1613     __ srli(count, count, LogBytesPerInt);    // size => int_count
1614     __ j(RuntimeAddress(int_copy_entry));
1615     __ BIND(L_long_aligned);
1616     __ srli(count, count, LogBytesPerLong);   // size => long_count
1617     __ j(RuntimeAddress(long_copy_entry));
1618 
1619     return start;
1620   }
1621 
1622   //
1623   //  Generate generic array copy stubs
1624   //
1625   //  Input:
1626   //    c_rarg0    -  src oop
1627   //    c_rarg1    -  src_pos (32-bits)
1628   //    c_rarg2    -  dst oop
1629   //    c_rarg3    -  dst_pos (32-bits)
1630   //    c_rarg4    -  element count (32-bits)
1631   //
1632   //  Output:
1633   //    x10 ==  0  -  success
1634   //    x10 == -1^K - failure, where K is partial transfer count
1635   //
1636   address generate_generic_copy(const char* name,
1637                                 address byte_copy_entry, address short_copy_entry,
1638                                 address int_copy_entry, address oop_copy_entry,
1639                                 address long_copy_entry, address checkcast_copy_entry) {
1640     assert_cond(byte_copy_entry != NULL && short_copy_entry != NULL &&
1641                 int_copy_entry != NULL && oop_copy_entry != NULL &&
1642                 long_copy_entry != NULL && checkcast_copy_entry != NULL);
1643     Label L_failed, L_failed_0, L_objArray;
1644     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1645 
1646     // Input registers
1647     const Register src        = c_rarg0;  // source array oop
1648     const Register src_pos    = c_rarg1;  // source position
1649     const Register dst        = c_rarg2;  // destination array oop
1650     const Register dst_pos    = c_rarg3;  // destination position
1651     const Register length     = c_rarg4;
1652 
1653     // Registers used as temps
1654     const Register dst_klass = c_rarg5;
1655 
1656     __ align(CodeEntryAlignment);
1657 
1658     StubCodeMark mark(this, "StubRoutines", name);
1659 
1660     address start = __ pc();
1661 
1662     __ enter(); // required for proper stackwalking of RuntimeStub frame
1663 
1664     // bump this on entry, not on exit:
1665     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1666 
1667     //-----------------------------------------------------------------------
1668     // Assembler stub will be used for this call to arraycopy
1669     // if the following conditions are met:
1670     //
1671     // (1) src and dst must not be null.
1672     // (2) src_pos must not be negative.
1673     // (3) dst_pos must not be negative.
1674     // (4) length  must not be negative.
1675     // (5) src klass and dst klass should be the same and not NULL.
1676     // (6) src and dst should be arrays.
1677     // (7) src_pos + length must not exceed length of src.
1678     // (8) dst_pos + length must not exceed length of dst.
1679     //
1680 
1681     // if [src == NULL] then return -1
1682     __ beqz(src, L_failed);
1683 
1684     // if [src_pos < 0] then return -1
1685     // i.e. sign bit set
1686     __ andi(t0, src_pos, 1UL << 31);
1687     __ bnez(t0, L_failed);
1688 
1689     // if [dst == NULL] then return -1
1690     __ beqz(dst, L_failed);
1691 
1692     // if [dst_pos < 0] then return -1
1693     // i.e. sign bit set
1694     __ andi(t0, dst_pos, 1UL << 31);
1695     __ bnez(t0, L_failed);
1696 
1697     // registers used as temp
1698     const Register scratch_length    = x28; // elements count to copy
1699     const Register scratch_src_klass = x29; // array klass
1700     const Register lh                = x30; // layout helper
1701 
1702     // if [length < 0] then return -1
1703     __ addw(scratch_length, length, zr);    // length (elements count, 32-bits value)
1704     // i.e. sign bit set
1705     __ andi(t0, scratch_length, 1UL << 31);
1706     __ bnez(t0, L_failed);
1707 
1708     __ load_klass(scratch_src_klass, src);
1709 #ifdef ASSERT
1710     {
1711       BLOCK_COMMENT("assert klasses not null {");
1712       Label L1, L2;
1713       __ bnez(scratch_src_klass, L2);   // it is broken if klass is NULL
1714       __ bind(L1);
1715       __ stop("broken null klass");
1716       __ bind(L2);
1717       __ load_klass(t0, dst);
1718       __ beqz(t0, L1);     // this would be broken also
1719       BLOCK_COMMENT("} assert klasses not null done");
1720     }
1721 #endif
1722 
1723     // Load layout helper (32-bits)
1724     //
1725     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1726     // 32        30    24            16              8     2                 0
1727     //
1728     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1729     //
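         // For example, a jint[] klass has array_tag == 0x3, an element_type
         // encoding T_INT and log2_element_size == 2, while an Object[] klass has
         // array_tag == 0x2. Both array tags have bit 31 set, so every array
         // layout helper is negative when viewed as a signed 32-bit value.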
1730 
1731     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1732 
1733     // Handle objArrays completely differently...
1734     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1735     __ lw(lh, Address(scratch_src_klass, lh_offset));
1736     __ mvw(t0, objArray_lh);
1737     __ beq(lh, t0, L_objArray);
1738 
1739     // if [src->klass() != dst->klass()] then return -1
1740     __ load_klass(t1, dst);
1741     __ bne(t1, scratch_src_klass, L_failed);
1742 
1743     // if [!src->is_Array()] then return -1
1744     // i.e. the layout helper sign bit is clear (lh >= 0)
1745     __ andi(t0, lh, 1UL << 31);
1746     __ beqz(t0, L_failed);
1747 
1748     // At this point, it is known to be a typeArray (array_tag 0x3).
1749 #ifdef ASSERT
1750     {
1751       BLOCK_COMMENT("assert primitive array {");
1752       Label L;
1753       __ mvw(t1, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1754       __ bge(lh, t1, L);
1755       __ stop("must be a primitive array");
1756       __ bind(L);
1757       BLOCK_COMMENT("} assert primitive array done");
1758     }
1759 #endif
1760 
1761     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1762                            t1, L_failed);
1763 
1764     // TypeArrayKlass
1765     //
1766     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1767     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1768     //
1769 
1770     const Register t0_offset = t0;    // array offset
1771     const Register x22_elsize = lh;   // element size
1772 
1773     // Get array_header_in_bytes()
1774     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1775     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1776     __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // shift off the bits above the header_size field
1777     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1778 
1779     __ add(src, src, t0_offset);           // src array offset
1780     __ add(dst, dst, t0_offset);           // dst array offset
1781     BLOCK_COMMENT("choose copy loop based on element size");
1782 
1783     // The following registers must be set before jumping to the corresponding stub
1784     const Register from     = c_rarg0;  // source array address
1785     const Register to       = c_rarg1;  // destination array address
1786     const Register count    = c_rarg2;  // elements count
1787 
1788     // 'from', 'to' and 'count' must be set in exactly this order, because they
1789     // alias 'src', 'src_pos' and 'dst': each input is consumed before its register is overwritten.
1790 
1791     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1792 
1793     // The possible values of elsize are 0-3, i.e. exact_log2(element
1794     // size in bytes).  We do a simple bitwise binary search.
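         // Bit 1 of elsize selects between the {byte, short} and {int, long}
         // pairs, and bit 0 then picks the element within the pair, so two
         // single-bit tests are enough to dispatch to the right copy entry.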
1795   __ BIND(L_copy_bytes);
1796     __ andi(t0, x22_elsize, 2);
1797     __ bnez(t0, L_copy_ints);
1798     __ andi(t0, x22_elsize, 1);
1799     __ bnez(t0, L_copy_shorts);
1800     __ add(from, src, src_pos); // src_addr
1801     __ add(to, dst, dst_pos); // dst_addr
1802     __ addw(count, scratch_length, zr); // length
1803     __ j(RuntimeAddress(byte_copy_entry));
1804 
1805   __ BIND(L_copy_shorts);
1806     __ shadd(from, src_pos, src, t0, 1); // src_addr
1807     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1808     __ addw(count, scratch_length, zr); // length
1809     __ j(RuntimeAddress(short_copy_entry));
1810 
1811   __ BIND(L_copy_ints);
1812     __ andi(t0, x22_elsize, 1);
1813     __ bnez(t0, L_copy_longs);
1814     __ shadd(from, src_pos, src, t0, 2); // src_addr
1815     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
1816     __ addw(count, scratch_length, zr); // length
1817     __ j(RuntimeAddress(int_copy_entry));
1818 
1819   __ BIND(L_copy_longs);
1820 #ifdef ASSERT
1821     {
1822       BLOCK_COMMENT("assert long copy {");
1823       Label L;
1824       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x22_elsize
1825       __ addw(lh, lh, zr);
1826       __ mvw(t0, LogBytesPerLong);
1827       __ beq(x22_elsize, t0, L);
1828       __ stop("must be long copy, but elsize is wrong");
1829       __ bind(L);
1830       BLOCK_COMMENT("} assert long copy done");
1831     }
1832 #endif
1833     __ shadd(from, src_pos, src, t0, 3); // src_addr
1834     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
1835     __ addw(count, scratch_length, zr); // length
1836     __ j(RuntimeAddress(long_copy_entry));
1837 
1838     // ObjArrayKlass
1839   __ BIND(L_objArray);
1840     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1841 
1842     Label L_plain_copy, L_checkcast_copy;
1843     // test array classes for subtyping
1844     __ load_klass(t2, dst);
1845     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
1846 
1847     // Identically typed arrays can be copied without element-wise checks.
1848     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1849                            t1, L_failed);
1850 
1851     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1852     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1853     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1854     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1855     __ addw(count, scratch_length, zr); // length
1856   __ BIND(L_plain_copy);
1857     __ j(RuntimeAddress(oop_copy_entry));
1858 
1859   __ BIND(L_checkcast_copy);
1860     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
1861     {
1862       // Before looking at dst.length, make sure dst is also an objArray.
1863       __ lwu(t0, Address(t2, lh_offset));
1864       __ mvw(t1, objArray_lh);
1865       __ bne(t0, t1, L_failed);
1866 
1867       // It is safe to examine both src.length and dst.length.
1868       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1869                              t2, L_failed);
1870 
1871       __ load_klass(dst_klass, dst); // reload
1872 
1873       // Marshal the base address arguments now, freeing registers.
1874       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1875       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1876       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1877       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1878       __ addw(count, length, zr);           // length (reloaded)
1879       const Register sco_temp = c_rarg3;      // this register is free now
1880       assert_different_registers(from, to, count, sco_temp,
1881                                  dst_klass, scratch_src_klass);
1882 
1883       // Generate the type check.
1884       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1885       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1886 
1887       // Smashes t0, t1
1888       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
1889 
1890       // Fetch destination element klass from the ObjArrayKlass header.
1891       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1892       __ ld(dst_klass, Address(dst_klass, ek_offset));
1893       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1894 
1895       // the checkcast_copy loop needs two extra arguments:
1896       assert(c_rarg3 == sco_temp, "#3 already in place");
1897       // Set up arguments for checkcast_copy_entry.
1898       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
1899       __ j(RuntimeAddress(checkcast_copy_entry));
1900     }
1901 
1902   __ BIND(L_failed);
1903     __ li(x10, -1);
1904     __ leave();   // required for proper stackwalking of RuntimeStub frame
1905     __ ret();
1906 
1907     return start;
1908   }
1909 
1910   //
1911   // Generate stub for array fill. If "aligned" is true, the
1912   // "to" address is assumed to be heapword aligned.
1913   //
1914   // Arguments for generated stub:
1915   //   to:    c_rarg0
1916   //   value: c_rarg1
1917   //   count: c_rarg2 treated as signed
1918   //
1919   address generate_fill(BasicType t, bool aligned, const char* name) {
1920     __ align(CodeEntryAlignment);
1921     StubCodeMark mark(this, "StubRoutines", name);
1922     address start = __ pc();
1923 
1924     BLOCK_COMMENT("Entry:");
1925 
1926     const Register to        = c_rarg0;  // destination array address
1927     const Register value     = c_rarg1;  // value
1928     const Register count     = c_rarg2;  // elements count
1929 
1930     const Register bz_base   = x28;      // base for block_zero routine
1931     const Register cnt_words = x29;      // temp register
1932     const Register tmp_reg   = t1;
1933 
1934     __ enter();
1935 
1936     Label L_fill_elements, L_exit1;
1937 
1938     int shift = -1;
1939     switch (t) {
1940       case T_BYTE:
1941         shift = 0;
1942 
1943         // Zero extend value
1944         // 8 bit -> 16 bit
1945         __ andi(value, value, 0xff);
1946         __ mv(tmp_reg, value);
1947         __ slli(tmp_reg, tmp_reg, 8);
1948         __ orr(value, value, tmp_reg);
1949 
1950         // 16 bit -> 32 bit
1951         __ mv(tmp_reg, value);
1952         __ slli(tmp_reg, tmp_reg, 16);
1953         __ orr(value, value, tmp_reg);
1954 
1955         __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1956         __ bltu(count, tmp_reg, L_fill_elements);
1957         break;
1958       case T_SHORT:
1959         shift = 1;
1960         // Zero extend value
1961         // 16 bit -> 32 bit
1962         __ andi(value, value, 0xffff);
1963         __ mv(tmp_reg, value);
1964         __ slli(tmp_reg, tmp_reg, 16);
1965         __ orr(value, value, tmp_reg);
1966 
1967         // Short arrays (< 8 bytes) fill by element
1968         __ mv(tmp_reg, 8 >> shift);
1969         __ bltu(count, tmp_reg, L_fill_elements);
1970         break;
1971       case T_INT:
1972         shift = 2;
1973 
1974         // Short arrays (< 8 bytes) fill by element
1975         __ mv(tmp_reg, 8 >> shift);
1976         __ bltu(count, tmp_reg, L_fill_elements);
1977         break;
1978       default: ShouldNotReachHere();
1979     }
1980 
1981     // Align source address at 8 bytes address boundary.
1982     Label L_skip_align1, L_skip_align2, L_skip_align4;
1983     if (!aligned) {
1984       switch (t) {
1985         case T_BYTE:
1986           // One byte misalignment happens only for byte arrays.
1987           __ andi(t0, to, 1);
1988           __ beqz(t0, L_skip_align1);
1989           __ sb(value, Address(to, 0));
1990           __ addi(to, to, 1);
1991           __ addiw(count, count, -1);
1992           __ bind(L_skip_align1);
1993           // Fallthrough
1994         case T_SHORT:
1995           // Two bytes misalignment happens only for byte and short (char) arrays.
1996           __ andi(t0, to, 2);
1997           __ beqz(t0, L_skip_align2);
1998           __ sh(value, Address(to, 0));
1999           __ addi(to, to, 2);
2000           __ addiw(count, count, -(2 >> shift));
2001           __ bind(L_skip_align2);
2002           // Fallthrough
2003         case T_INT:
2004           // Align to 8 bytes, we know we are 4 byte aligned to start.
2005           __ andi(t0, to, 4);
2006           __ beqz(t0, L_skip_align4);
2007           __ sw(value, Address(to, 0));
2008           __ addi(to, to, 4);
2009           __ addiw(count, count, -(4 >> shift));
2010           __ bind(L_skip_align4);
2011           break;
2012         default: ShouldNotReachHere();
2013       }
2014     }
2015 
2016     //
2017     //  Fill large chunks
2018     //
2019     __ srliw(cnt_words, count, 3 - shift); // number of words
2020 
2021     // 32 bit -> 64 bit
2022     __ andi(value, value, 0xffffffff);
2023     __ mv(tmp_reg, value);
2024     __ slli(tmp_reg, tmp_reg, 32);
2025     __ orr(value, value, tmp_reg);
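         // value now holds the fill pattern replicated across all 64 bits, so
         // fill_words can store it one doubleword (8 bytes) at a time.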
2026 
2027     __ slli(tmp_reg, cnt_words, 3 - shift);
2028     __ subw(count, count, tmp_reg);
2029     {
2030       __ fill_words(to, cnt_words, value);
2031     }
2032 
2033     // Remaining count is less than 8 bytes. Fill it by a single store.
2034     // Note that the total length is no less than 8 bytes.
2035     if (t == T_BYTE || t == T_SHORT) {
2036       __ beqz(count, L_exit1);
2037       __ shadd(to, count, to, tmp_reg, shift); // points to the end
2038       __ sd(value, Address(to, -8)); // overwrite some elements
2039       __ bind(L_exit1);
2040       __ leave();
2041       __ ret();
2042     }
2043 
2044     // Handle fills of less than 8 bytes.
2045     Label L_fill_2, L_fill_4, L_exit2;
2046     __ bind(L_fill_elements);
2047     switch (t) {
2048       case T_BYTE:
2049         __ andi(t0, count, 1);
2050         __ beqz(t0, L_fill_2);
2051         __ sb(value, Address(to, 0));
2052         __ addi(to, to, 1);
2053         __ bind(L_fill_2);
2054         __ andi(t0, count, 2);
2055         __ beqz(t0, L_fill_4);
2056         __ sh(value, Address(to, 0));
2057         __ addi(to, to, 2);
2058         __ bind(L_fill_4);
2059         __ andi(t0, count, 4);
2060         __ beqz(t0, L_exit2);
2061         __ sw(value, Address(to, 0));
2062         break;
2063       case T_SHORT:
2064         __ andi(t0, count, 1);
2065         __ beqz(t0, L_fill_4);
2066         __ sh(value, Address(to, 0));
2067         __ addi(to, to, 2);
2068         __ bind(L_fill_4);
2069         __ andi(t0, count, 2);
2070         __ beqz(t0, L_exit2);
2071         __ sw(value, Address(to, 0));
2072         break;
2073       case T_INT:
2074         __ beqz(count, L_exit2);
2075         __ sw(value, Address(to, 0));
2076         break;
2077       default: ShouldNotReachHere();
2078     }
2079     __ bind(L_exit2);
2080     __ leave();
2081     __ ret();
2082     return start;
2083   }
2084 
2085   void generate_arraycopy_stubs() {
2086     address entry                     = NULL;
2087     address entry_jbyte_arraycopy     = NULL;
2088     address entry_jshort_arraycopy    = NULL;
2089     address entry_jint_arraycopy      = NULL;
2090     address entry_oop_arraycopy       = NULL;
2091     address entry_jlong_arraycopy     = NULL;
2092     address entry_checkcast_arraycopy = NULL;
2093 
2094     generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards);
2095     generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards);
2096 
2097     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2098 
2099     //*** jbyte
2100     // Always need aligned and unaligned versions
2101     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_byte_copy(false, &entry,
2102                                                                                    "jbyte_disjoint_arraycopy");
2103     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_byte_copy(false, entry,
2104                                                                                    &entry_jbyte_arraycopy,
2105                                                                                    "jbyte_arraycopy");
2106     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, &entry,
2107                                                                                    "arrayof_jbyte_disjoint_arraycopy");
2108     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_byte_copy(true, entry, NULL,
2109                                                                                    "arrayof_jbyte_arraycopy");
2110 
2111     //*** jshort
2112     // Always need aligned and unaligned versions
2113     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2114                                                                                     "jshort_disjoint_arraycopy");
2115     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2116                                                                                     &entry_jshort_arraycopy,
2117                                                                                     "jshort_arraycopy");
2118     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2119                                                                                     "arrayof_jshort_disjoint_arraycopy");
2120     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2121                                                                                     "arrayof_jshort_arraycopy");
2122 
2123     //*** jint
2124     // Aligned versions
2125     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_copy(true, &entry,
2126                                                                                   "arrayof_jint_disjoint_arraycopy");
2127     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2128                                                                                   "arrayof_jint_arraycopy");
2129     // On 64-bit we need both aligned and unaligned versions of jint arraycopy;
2130     // entry_jint_arraycopy always points to the unaligned version.
2131     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_int_copy(false, &entry,
2132                                                                                   "jint_disjoint_arraycopy");
2133     StubRoutines::_jint_arraycopy                    = generate_conjoint_int_copy(false, entry,
2134                                                                                   &entry_jint_arraycopy,
2135                                                                                   "jint_arraycopy");
2136 
2137     //*** jlong
2138     // It is always aligned
2139     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, &entry,
2140                                                                                    "arrayof_jlong_disjoint_arraycopy");
2141     StubRoutines::_arrayof_jlong_arraycopy           = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2142                                                                                    "arrayof_jlong_arraycopy");
2143     StubRoutines::_jlong_disjoint_arraycopy          = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2144     StubRoutines::_jlong_arraycopy                   = StubRoutines::_arrayof_jlong_arraycopy;
2145 
2146     //*** oops
2147     {
2148       // With compressed oops we need unaligned versions; notice that
2149       // we overwrite entry_oop_arraycopy.
2150       bool aligned = !UseCompressedOops;
2151 
2152       StubRoutines::_arrayof_oop_disjoint_arraycopy
2153         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2154                                      /*dest_uninitialized*/false);
2155       StubRoutines::_arrayof_oop_arraycopy
2156         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2157                                      /*dest_uninitialized*/false);
2158       // Aligned versions without pre-barriers
2159       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2160         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2161                                      /*dest_uninitialized*/true);
2162       StubRoutines::_arrayof_oop_arraycopy_uninit
2163         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2164                                      /*dest_uninitialized*/true);
2165     }
2166 
2167     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2168     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2169     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2170     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2171 
2172     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2173     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2174                                                                         /*dest_uninitialized*/true);
2175 
2176 
2177     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2178                                                               entry_jbyte_arraycopy,
2179                                                               entry_jshort_arraycopy,
2180                                                               entry_jint_arraycopy,
2181                                                               entry_jlong_arraycopy);
2182 
2183     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2184                                                                entry_jbyte_arraycopy,
2185                                                                entry_jshort_arraycopy,
2186                                                                entry_jint_arraycopy,
2187                                                                entry_oop_arraycopy,
2188                                                                entry_jlong_arraycopy,
2189                                                                entry_checkcast_arraycopy);
2190 
2191     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2192     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2193     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2194     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2195     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2196     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2197   }
2198 
2199   // Safefetch stubs.
2200   void generate_safefetch(const char* name, int size, address* entry,
2201                           address* fault_pc, address* continuation_pc) {
2202     // safefetch signatures:
2203     //   int      SafeFetch32(int*      adr, int      errValue)
2204     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue)
2205     //
2206     // arguments:
2207     //   c_rarg0 = adr
2208     //   c_rarg1 = errValue
2209     //
2210     // result:
2211     //   x10 = *adr or errValue
2212     assert_cond(entry != NULL && fault_pc != NULL && continuation_pc != NULL);
2213     StubCodeMark mark(this, "StubRoutines", name);
2214 
2215     // Entry point (pc).
2216     *entry = __ pc();
2217 
2218     // Load *adr into c_rarg1, may fault.
2219     *fault_pc = __ pc();
2220     switch (size) {
2221       case 4:
2222         // int32_t
2223         __ lw(c_rarg1, Address(c_rarg0, 0));
2224         break;
2225       case 8:
2226         // int64_t
2227         __ ld(c_rarg1, Address(c_rarg0, 0));
2228         break;
2229       default:
2230         ShouldNotReachHere();
2231     }
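         // If the load above faults, the signal handler resumes execution at
         // *continuation_pc; c_rarg1 then still holds errValue, so the move below
         // returns it. Otherwise the freshly loaded value is returned.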
2232 
2233     // return errValue or *adr
2234     *continuation_pc = __ pc();
2235     __ mv(x10, c_rarg1);
2236     __ ret();
2237   }
2238 
2239   // code for comparing 16 bytes of strings with same encoding
2240   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
2241     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31;
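         // On entry tmp1 and tmp2 hold the current 8 bytes of str1 and str2
         // (pre-loaded by the caller or by the previous call to this helper).
         // They are compared here while the next 8 bytes of each string are being
         // loaded (into tmp5 and cnt1), so loads overlap the compares; DIFF1 is
         // taken for a mismatch in tmp1/tmp2 and DIFF2 for one in tmp5/cnt1.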
2242     __ ld(tmp5, Address(str1));
2243     __ addi(str1, str1, 8);
2244     __ xorr(tmp4, tmp1, tmp2);
2245     __ ld(cnt1, Address(str2));
2246     __ addi(str2, str2, 8);
2247     __ bnez(tmp4, DIFF1);
2248     __ ld(tmp1, Address(str1));
2249     __ addi(str1, str1, 8);
2250     __ xorr(tmp4, tmp5, cnt1);
2251     __ ld(tmp2, Address(str2));
2252     __ addi(str2, str2, 8);
2253     __ bnez(tmp4, DIFF2);
2254   }
2255 
2256   // code for comparing 8 characters of strings with Latin1 and Utf16 encoding
2257   void compare_string_8_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
2258                               Label &DIFF2) {
2259     const Register strU = x12, curU = x7, strL = x29, tmp = x30;
2260     __ ld(tmpL, Address(strL));
2261     __ addi(strL, strL, 8);
2262     __ ld(tmpU, Address(strU));
2263     __ addi(strU, strU, 8);
2264     __ inflate_lo32(tmp, tmpL);
2265     __ mv(t0, tmp);
2266     __ xorr(tmp, curU, t0);
2267     __ bnez(tmp, DIFF2);
2268 
2269     __ ld(curU, Address(strU));
2270     __ addi(strU, strU, 8);
2271     __ inflate_hi32(tmp, tmpL);
2272     __ mv(t0, tmp);
2273     __ xorr(tmp, tmpU, t0);
2274     __ bnez(tmp, DIFF1);
2275   }
2276 
2277   // x10  = result
2278   // x11  = str1
2279   // x12  = cnt1
2280   // x13  = str2
2281   // x14  = cnt2
2282   // x28  = tmp1
2283   // x29  = tmp2
2284   // x30  = tmp3
2285   address generate_compare_long_string_different_encoding(bool isLU) {
2286     __ align(CodeEntryAlignment);
2287     StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL");
2288     address entry = __ pc();
2289     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
2290           DONE, CALCULATE_DIFFERENCE;
2291     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2292                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2293     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
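         // For the LU variant str1 is Latin1 and str2 is UTF-16; for UL the
         // encodings are swapped. The Latin1 string is inflated 8 characters at a
         // time and compared against the UTF-16 string.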
2294 
2295     // cnt2 == number of characters left to compare
2296     // Check the first 4 symbols, which have already been loaded by the caller
2297     __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2);
2298     __ mv(isLU ? tmp1 : tmp2, tmp3);
2299     __ addi(str1, str1, isLU ? wordSize / 2 : wordSize);
2300     __ addi(str2, str2, isLU ? wordSize : wordSize / 2);
2301     __ sub(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
2302     __ push_reg(spilled_regs, sp);
2303 
2304     if (isLU) {
2305       __ add(str1, str1, cnt2);
2306       __ shadd(str2, cnt2, str2, t0, 1);
2307     } else {
2308       __ shadd(str1, cnt2, str1, t0, 1);
2309       __ add(str2, str2, cnt2);
2310     }
2311     __ xorr(tmp3, tmp1, tmp2);
2312     __ mv(tmp5, tmp2);
2313     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2314 
2315     Register strU = isLU ? str2 : str1,
2316              strL = isLU ? str1 : str2,
2317              tmpU = isLU ? tmp5 : tmp1, // where to keep U for comparison
2318              tmpL = isLU ? tmp1 : tmp5; // where to keep L for comparison
2319 
2320     __ sub(tmp2, strL, cnt2); // strL pointer to load from
2321     __ slli(t0, cnt2, 1);
2322     __ sub(cnt1, strU, t0); // strU pointer to load from
2323 
2324     __ ld(tmp4, Address(cnt1));
2325     __ addi(cnt1, cnt1, 8);
2326     __ beqz(cnt2, LOAD_LAST); // no characters left except last load
2327     __ sub(cnt2, cnt2, 16);
2328     __ bltz(cnt2, TAIL);
2329     __ bind(SMALL_LOOP); // smaller loop
2330       __ sub(cnt2, cnt2, 16);
2331       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2332       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2333       __ bgez(cnt2, SMALL_LOOP);
2334       __ addi(t0, cnt2, 16);
2335       __ beqz(t0, LOAD_LAST);
2336     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
2337       // Address of 8 bytes before last 4 characters in UTF-16 string
2338       __ shadd(cnt1, cnt2, cnt1, t0, 1);
2339       // Address of 16 bytes before last 4 characters in Latin1 string
2340       __ add(tmp2, tmp2, cnt2);
2341       __ ld(tmp4, Address(cnt1, -8));
2342       // last 16 characters before last load
2343       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2344       compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
2345       __ j(LOAD_LAST);
2346     __ bind(DIFF2);
2347       __ mv(tmpU, tmp4);
2348     __ bind(DIFF1);
2349       __ mv(tmpL, t0);
2350       __ j(CALCULATE_DIFFERENCE);
2351     __ bind(LOAD_LAST);
2352       // The last 4 UTF-16 characters have already been pre-loaded into tmp4 by compare_string_8_x_LU,
2353       // so there is no need to load them again.
2354       __ mv(tmpU, tmp4);
2355       __ ld(tmpL, Address(strL));
2356       __ inflate_lo32(tmp3, tmpL);
2357       __ mv(tmpL, tmp3);
2358       __ xorr(tmp3, tmpU, tmpL);
2359       __ beqz(tmp3, DONE);
2360 
2361       // Find the first different characters in the longwords and
2362       // compute their difference.
2363     __ bind(CALCULATE_DIFFERENCE);
2364       __ ctzc_bit(tmp4, tmp3);
2365       __ srl(tmp1, tmp1, tmp4);
2366       __ srl(tmp5, tmp5, tmp4);
2367       __ andi(tmp1, tmp1, 0xFFFF);
2368       __ andi(tmp5, tmp5, 0xFFFF);
2369       __ sub(result, tmp1, tmp5);
2370     __ bind(DONE);
2371       __ pop_reg(spilled_regs, sp);
2372       __ ret();
2373     return entry;
2374   }
2375 
2376   address generate_method_entry_barrier() {
2377     __ align(CodeEntryAlignment);
2378     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
2379 
2380     Label deoptimize_label;
2381 
2382     address start = __ pc();
2383 
2384     __ set_last_Java_frame(sp, fp, ra, t0);
2385 
2386     __ enter();
2387     __ add(t1, sp, wordSize);
2388 
2389     __ sub(sp, sp, 4 * wordSize);
2390 
2391     __ push_call_clobbered_registers();
2392 
2393     __ mv(c_rarg0, t1);
2394     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
2395 
2396     __ reset_last_Java_frame(true);
2397 
2398     __ mv(t0, x10);
2399 
2400     __ pop_call_clobbered_registers();
2401 
2402     __ bnez(t0, deoptimize_label);
2403 
2404     __ leave();
2405     __ ret();
2406 
2407     __ BIND(deoptimize_label);
2408 
2409     __ ld(t0, Address(sp, 0));
2410     __ ld(fp, Address(sp, wordSize));
2411     __ ld(ra, Address(sp, wordSize * 2));
2412     __ ld(t1, Address(sp, wordSize * 3));
2413 
2414     __ mv(sp, t0);
2415     __ jr(t1);
2416 
2417     return start;
2418   }
2419 
2420   // x10  = result
2421   // x11  = str1
2422   // x12  = cnt1
2423   // x13  = str2
2424   // x14  = cnt2
2425   // x28  = tmp1
2426   // x29  = tmp2
2427   // x30  = tmp3
2428   // x31  = tmp4
2429   address generate_compare_long_string_same_encoding(bool isLL) {
2430     __ align(CodeEntryAlignment);
2431     StubCodeMark mark(this, "StubRoutines", isLL ?
2432                       "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU");
2433     address entry = __ pc();
2434     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
2435           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
2436     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2437                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2438     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
2439 
2440     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
2441     // Update the cnt2 counter to account for the 8 bytes already loaded.
2442     __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
2443     // update pointers, because of previous read
2444     __ add(str1, str1, wordSize);
2445     __ add(str2, str2, wordSize);
2446     // less than 16 bytes left?
2447     __ sub(cnt2, cnt2, isLL ? 16 : 8);
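         // Each iteration of SMALL_LOOP compares 16 bytes, i.e. 16 Latin1
         // characters or 8 UTF-16 characters, hence the (isLL ? 16 : 8) step.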
2448     __ push_reg(spilled_regs, sp);
2449     __ bltz(cnt2, TAIL);
2450     __ bind(SMALL_LOOP);
2451       compare_string_16_bytes_same(DIFF, DIFF2);
2452       __ sub(cnt2, cnt2, isLL ? 16 : 8);
2453       __ bgez(cnt2, SMALL_LOOP);
2454     __ bind(TAIL);
2455       __ addi(cnt2, cnt2, isLL ? 16 : 8);
2456       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
2457       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2458       __ blez(cnt2, CHECK_LAST);
2459       __ xorr(tmp4, tmp1, tmp2);
2460       __ bnez(tmp4, DIFF);
2461       __ ld(tmp1, Address(str1));
2462       __ addi(str1, str1, 8);
2463       __ ld(tmp2, Address(str2));
2464       __ addi(str2, str2, 8);
2465       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2466     __ bind(CHECK_LAST);
2467       if (!isLL) {
2468         __ add(cnt2, cnt2, cnt2); // now in bytes
2469       }
2470       __ xorr(tmp4, tmp1, tmp2);
2471       __ bnez(tmp4, DIFF);
2472       __ add(str1, str1, cnt2);
2473       __ ld(tmp5, Address(str1));
2474       __ add(str2, str2, cnt2);
2475       __ ld(cnt1, Address(str2));
2476       __ xorr(tmp4, tmp5, cnt1);
2477       __ beqz(tmp4, LENGTH_DIFF);
2478       // Find the first different characters in the longwords and
2479       // compute their difference.
2480     __ bind(DIFF2);
2481       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2482       __ srl(tmp5, tmp5, tmp3);
2483       __ srl(cnt1, cnt1, tmp3);
2484       if (isLL) {
2485         __ andi(tmp5, tmp5, 0xFF);
2486         __ andi(cnt1, cnt1, 0xFF);
2487       } else {
2488         __ andi(tmp5, tmp5, 0xFFFF);
2489         __ andi(cnt1, cnt1, 0xFFFF);
2490       }
2491       __ sub(result, tmp5, cnt1);
2492       __ j(LENGTH_DIFF);
2493     __ bind(DIFF);
2494       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2495       __ srl(tmp1, tmp1, tmp3);
2496       __ srl(tmp2, tmp2, tmp3);
2497       if (isLL) {
2498         __ andi(tmp1, tmp1, 0xFF);
2499         __ andi(tmp2, tmp2, 0xFF);
2500       } else {
2501         __ andi(tmp1, tmp1, 0xFFFF);
2502         __ andi(tmp2, tmp2, 0xFFFF);
2503       }
2504       __ sub(result, tmp1, tmp2);
2505       __ j(LENGTH_DIFF);
2506     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
2507       __ xorr(tmp4, tmp1, tmp2);
2508       __ bnez(tmp4, DIFF);
2509     __ bind(LENGTH_DIFF);
2510       __ pop_reg(spilled_regs, sp);
2511       __ ret();
2512     return entry;
2513   }
2514 
2515   void generate_compare_long_strings() {
2516     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true);
2517     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false);
2518     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true);
2519     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false);
2520   }
2521 
2522   // x10 result
2523   // x11 src
2524   // x12 src count
2525   // x13 pattern
2526   // x14 pattern count
2527   address generate_string_indexof_linear(bool needle_isL, bool haystack_isL)
2528   {
2529     const char* stubName = needle_isL
2530            ? (haystack_isL ? "indexof_linear_ll" : "indexof_linear_ul")
2531            : "indexof_linear_uu";
2532     __ align(CodeEntryAlignment);
2533     StubCodeMark mark(this, "StubRoutines", stubName);
2534     address entry = __ pc();
2535 
2536     int needle_chr_size = needle_isL ? 1 : 2;
2537     int haystack_chr_size = haystack_isL ? 1 : 2;
2538     int needle_chr_shift = needle_isL ? 0 : 1;
2539     int haystack_chr_shift = haystack_isL ? 0 : 1;
2540     bool isL = needle_isL && haystack_isL;
2541     // parameters
2542     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
2543     // temporary registers
2544     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
2545     // redefinitions
2546     Register ch1 = x28, ch2 = x29;
2547     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
2548 
2549     __ push_reg(spilled_regs, sp);
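         // The stub scans the haystack a word (8 bytes) at a time: a SWAR match
         // mask marks every position whose character equals the first needle
         // character, and each marked position is then verified against the full
         // needle in the corresponding CMP loop.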
2550 
2551     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
2552           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
2553           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
2554           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
2555           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
2556           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
2557 
2558     __ ld(ch1, Address(needle));
2559     __ ld(ch2, Address(haystack));
2560     // src.length - pattern.length
2561     __ sub(haystack_len, haystack_len, needle_len);
2562 
2563     // first is needle[0]
2564     __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first);
2565     uint64_t mask0101 = UCONST64(0x0101010101010101);
2566     uint64_t mask0001 = UCONST64(0x0001000100010001);
2567     __ mv(mask1, haystack_isL ? mask0101 : mask0001);
2568     __ mul(first, first, mask1);
2569     uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
2570     uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
2571     __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
2572     if (needle_isL != haystack_isL) {
2573       __ mv(tmp, ch1);
2574     }
2575     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
2576     __ blez(haystack_len, L_SMALL);
2577 
2578     if (needle_isL != haystack_isL) {
2579       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2580     }
2581     // xorr, sub, orr, notr, andr
2582     // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
2583     // eg:
2584     // first:        aa aa aa aa aa aa aa aa
2585     // ch2:          aa aa li nx jd ka aa aa
2586     // match_mask:   80 80 00 00 00 00 80 80
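         // In effect this is the classic SWAR zero-byte test: with x = ch2 ^ first,
         // match_mask = (x - mask1) & ~(x | mask2), which sets the high bit of
         // every byte (halfword for UTF-16) of match_mask exactly where the
         // corresponding element of x is zero, i.e. where the haystack character
         // equals the first needle character.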
2587     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2588 
2589     // Search for the first char of the needle; if found, go to L_HAS_ZERO.
2590     __ bnez(match_mask, L_HAS_ZERO);
2591     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2592     __ add(result, result, wordSize / haystack_chr_size);
2593     __ add(haystack, haystack, wordSize);
2594     __ bltz(haystack_len, L_POST_LOOP);
2595 
2596     __ bind(L_LOOP);
2597     __ ld(ch2, Address(haystack));
2598     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2599     __ bnez(match_mask, L_HAS_ZERO);
2600 
2601     __ bind(L_LOOP_PROCEED);
2602     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2603     __ add(haystack, haystack, wordSize);
2604     __ add(result, result, wordSize / haystack_chr_size);
2605     __ bgez(haystack_len, L_LOOP);
2606 
2607     __ bind(L_POST_LOOP);
2608     __ mv(ch2, -wordSize / haystack_chr_size);
2609     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
2610     __ ld(ch2, Address(haystack));
2611     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2612     __ neg(haystack_len, haystack_len);
2613     __ xorr(ch2, first, ch2);
2614     __ sub(match_mask, ch2, mask1);
2615     __ orr(ch2, ch2, mask2);
2616     __ mv(trailing_zeros, -1); // all bits set
2617     __ j(L_SMALL_PROCEED);
2618 
2619     __ align(OptoLoopAlignment);
2620     __ bind(L_SMALL);
2621     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2622     __ neg(haystack_len, haystack_len);
2623     if (needle_isL != haystack_isL) {
2624       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2625     }
2626     __ xorr(ch2, first, ch2);
2627     __ sub(match_mask, ch2, mask1);
2628     __ orr(ch2, ch2, mask2);
2629     __ mv(trailing_zeros, -1); // all bits set
2630 
2631     __ bind(L_SMALL_PROCEED);
2632     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
2633     __ notr(ch2, ch2);
2634     __ andr(match_mask, match_mask, ch2);
2635     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
2636     __ beqz(match_mask, NOMATCH);
2637 
2638     __ bind(L_SMALL_HAS_ZERO_LOOP);
2639     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros
2640     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2641     __ mv(ch2, wordSize / haystack_chr_size);
2642     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
2643     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2644     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2645     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2646 
2647     __ bind(L_SMALL_CMP_LOOP);
2648     __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
2649     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2650     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
2651     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2652     __ add(trailing_zeros, trailing_zeros, 1);
2653     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
2654     __ beq(first, ch2, L_SMALL_CMP_LOOP);
2655 
2656     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
2657     __ beqz(match_mask, NOMATCH);
2658     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2659     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2660     __ add(result, result, 1);
2661     __ add(haystack, haystack, haystack_chr_size);
2662     __ j(L_SMALL_HAS_ZERO_LOOP);
2663 
2664     __ align(OptoLoopAlignment);
2665     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
2666     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2667     __ j(DONE);
2668 
2669     __ align(OptoLoopAlignment);
2670     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
2671     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2672     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2673     __ j(DONE);
2674 
2675     __ align(OptoLoopAlignment);
2676     __ bind(L_HAS_ZERO);
2677     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2678     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2679     __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
2680     __ orr(haystack_len, haystack_len, needle_len); // pack needle_len into the upper 32 bits of haystack_len
2681     __ sub(result, result, 1); // array index from 0, so result -= 1
2682 
2683     __ bind(L_HAS_ZERO_LOOP);
2684     __ mv(needle_len, wordSize / haystack_chr_size);
2685     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
2686     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
2687     // load next 8 bytes from haystack, and increase result index
2688     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2689     __ add(result, result, 1);
2690     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2691     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2692 
2693     // compare one char
2694     __ bind(L_CMP_LOOP);
2695     __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
2696     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
2697     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2698     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2699     __ add(trailing_zeros, trailing_zeros, 1); // next char index
2700     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
2701     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
2702     __ beq(needle_len, ch2, L_CMP_LOOP);
2703 
2704     __ bind(L_CMP_LOOP_NOMATCH);
2705     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
2706     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index
2707     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2708     __ add(haystack, haystack, haystack_chr_size);
2709     __ j(L_HAS_ZERO_LOOP);
2710 
2711     __ align(OptoLoopAlignment);
2712     __ bind(L_CMP_LOOP_LAST_CMP);
2713     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
2714     __ j(DONE);
2715 
2716     __ align(OptoLoopAlignment);
2717     __ bind(L_CMP_LOOP_LAST_CMP2);
2718     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2719     __ add(result, result, 1);
2720     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2721     __ j(DONE);
2722 
2723     __ align(OptoLoopAlignment);
2724     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
2725     // 1) Restore the "result" index. Until the L_HAS_ZERO block the index was a
2726     // multiple of wordSize/haystack_chr_size. The byte octet was analyzed in
2727     // L_HAS_ZERO_LOOP, so result was increased by at most
2728     // wordSize/haystack_chr_size - 1 and the respective high bits were unchanged.
2729     // L_LOOP_PROCEED will increase result by the number of analyzed characters,
2730     // so we can simply reset the lower bits of result here: clear the 2 lower
2731     // bits for UU/UL and the 3 lower bits for LL.
2732     // 2) Restore needle_len and haystack_len from the "compressed" haystack_len.
2733     // 3) Advance haystack to the next octet: result & 7 (or 3) is the index of the
2734     // last analyzed substring inside the current octet, so haystack points at its start and must be advanced to the next octet.
2735     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
2736     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
2737     __ andi(result, result, haystack_isL ? -8 : -4);
2738     __ slli(tmp, match_mask, haystack_chr_shift);
2739     __ sub(haystack, haystack, tmp);
2740     __ addw(haystack_len, haystack_len, zr);
2741     __ j(L_LOOP_PROCEED);
2742 
2743     __ align(OptoLoopAlignment);
2744     __ bind(NOMATCH);
2745     __ mv(result, -1);
2746 
2747     __ bind(DONE);
2748     __ pop_reg(spilled_regs, sp);
2749     __ ret();
2750     return entry;
2751   }
2752 
2753   void generate_string_indexof_stubs()
2754   {
2755     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
2756     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
2757     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
2758   }
2759 
2760 #ifdef COMPILER2
2761   address generate_mulAdd()
2762   {
2763     __ align(CodeEntryAlignment);
2764     StubCodeMark mark(this, "StubRoutines", "mulAdd");
2765 
2766     address entry = __ pc();
2767 
2768     const Register out     = x10;
2769     const Register in      = x11;
2770     const Register offset  = x12;
2771     const Register len     = x13;
2772     const Register k       = x14;
2773     const Register tmp     = x28;
2774 
2775     BLOCK_COMMENT("Entry:");
2776     __ enter();
2777     __ mul_add(out, in, offset, len, k, tmp);
2778     __ leave();
2779     __ ret();
2780 
2781     return entry;
2782   }
2783 
2784   /**
2785    *  Arguments:
2786    *
2787    *  Input:
2788    *    c_rarg0   - x address
2789    *    c_rarg1   - x length
2790    *    c_rarg2   - y address
2791    *    c_rarg3   - y length
2792    *    c_rarg4   - z address
2793    *    c_rarg5   - z length
2794    */
2795   address generate_multiplyToLen()
2796   {
2797     __ align(CodeEntryAlignment);
2798     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2799     address entry = __ pc();
2800 
2801     const Register x     = x10;
2802     const Register xlen  = x11;
2803     const Register y     = x12;
2804     const Register ylen  = x13;
2805     const Register z     = x14;
2806     const Register zlen  = x15;
2807 
2808     const Register tmp1  = x16;
2809     const Register tmp2  = x17;
2810     const Register tmp3  = x7;
2811     const Register tmp4  = x28;
2812     const Register tmp5  = x29;
2813     const Register tmp6  = x30;
2814     const Register tmp7  = x31;
2815 
2816     BLOCK_COMMENT("Entry:");
2817     __ enter(); // required for proper stackwalking of RuntimeStub frame
2818     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2819     __ leave(); // required for proper stackwalking of RuntimeStub frame
2820     __ ret();
2821 
2822     return entry;
2823   }
2824 
2825   address generate_squareToLen()
2826   {
2827     __ align(CodeEntryAlignment);
2828     StubCodeMark mark(this, "StubRoutines", "squareToLen");
2829     address entry = __ pc();
2830 
2831     const Register x     = x10;
2832     const Register xlen  = x11;
2833     const Register z     = x12;
2834     const Register zlen  = x13;
2835     const Register y     = x14; // == x
2836     const Register ylen  = x15; // == xlen
2837 
2838     const Register tmp1  = x16;
2839     const Register tmp2  = x17;
2840     const Register tmp3  = x7;
2841     const Register tmp4  = x28;
2842     const Register tmp5  = x29;
2843     const Register tmp6  = x30;
2844     const Register tmp7  = x31;
2845 
2846     BLOCK_COMMENT("Entry:");
2847     __ enter();
2848     __ mv(y, x);
2849     __ mv(ylen, xlen);
2850     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2851     __ leave();
2852     __ ret();
2853 
2854     return entry;
2855   }
2856 
2857   // Arguments:
2858   //
2859   // Input:
2860   //   c_rarg0   - newArr address
2861   //   c_rarg1   - oldArr address
2862   //   c_rarg2   - newIdx
2863   //   c_rarg3   - shiftCount
2864   //   c_rarg4   - numIter
2865   //
2866   address generate_bigIntegerLeftShift() {
2867     __ align(CodeEntryAlignment);
2868     StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
2869     address entry = __ pc();
2870 
2871     Label loop, exit;
2872 
2873     Register newArr        = c_rarg0;
2874     Register oldArr        = c_rarg1;
2875     Register newIdx        = c_rarg2;
2876     Register shiftCount    = c_rarg3;
2877     Register numIter       = c_rarg4;
2878 
2879     Register shiftRevCount = c_rarg5;
2880     Register oldArrNext    = t1;
2881 
2882     __ beqz(numIter, exit);
2883     __ shadd(newArr, newIdx, newArr, t0, 2);
2884 
2885     __ li(shiftRevCount, 32);
2886     __ sub(shiftRevCount, shiftRevCount, shiftCount);
2887 
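         // Vectorized shift: each iteration processes vl (set by vsetvli from numIter)
         // 32-bit limbs, computing
         //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i+1] >>> (32 - shiftCount)),
         // then advances oldArr and newArr by vl elements and decrements numIter by vl.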
2888     __ bind(loop);
2889     __ addi(oldArrNext, oldArr, 4);
2890     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
2891     __ vle32_v(v0, oldArr);
2892     __ vle32_v(v4, oldArrNext);
2893     __ vsll_vx(v0, v0, shiftCount);
2894     __ vsrl_vx(v4, v4, shiftRevCount);
2895     __ vor_vv(v0, v0, v4);
2896     __ vse32_v(v0, newArr);
2897     __ sub(numIter, numIter, t0);
2898     __ shadd(oldArr, t0, oldArr, t1, 2);
2899     __ shadd(newArr, t0, newArr, t1, 2);
2900     __ bnez(numIter, loop);
2901 
2902     __ bind(exit);
2903     __ ret();
2904 
2905     return entry;
2906   }
2907 
2908   // Arguments:
2909   //
2910   // Input:
2911   //   c_rarg0   - newArr address
2912   //   c_rarg1   - oldArr address
2913   //   c_rarg2   - newIdx
2914   //   c_rarg3   - shiftCount
2915   //   c_rarg4   - numIter
2916   //
2917   address generate_bigIntegerRightShift() {
2918     __ align(CodeEntryAlignment);
2919     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
2920     address entry = __ pc();
2921 
2922     Label loop, exit;
2923 
2924     Register newArr        = c_rarg0;
2925     Register oldArr        = c_rarg1;
2926     Register newIdx        = c_rarg2;
2927     Register shiftCount    = c_rarg3;
2928     Register numIter       = c_rarg4;
2929     Register idx           = numIter;
2930 
2931     Register shiftRevCount = c_rarg5;
2932     Register oldArrNext    = c_rarg6;
2933     Register newArrCur     = t0;
2934     Register oldArrCur     = t1;
2935 
2936     __ beqz(idx, exit);
2937     __ shadd(newArr, newIdx, newArr, t0, 2);
2938 
2939     __ li(shiftRevCount, 32);
2940     __ sub(shiftRevCount, shiftRevCount, shiftCount);
2941 
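         // Vectorized shift working from the high end down: each iteration first
         // decrements idx by vl (set by vsetvli), then computes
         //   newArr[idx+i] = (oldArr[idx+i+1] >>> shiftCount) | (oldArr[idx+i] << (32 - shiftCount))
         // for the vl limbs starting at idx.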
2942     __ bind(loop);
2943     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
2944     __ sub(idx, idx, t0);
2945     __ shadd(oldArrNext, idx, oldArr, t1, 2);
2946     __ shadd(newArrCur, idx, newArr, t1, 2);
2947     __ addi(oldArrCur, oldArrNext, 4);
2948     __ vle32_v(v0, oldArrCur);
2949     __ vle32_v(v4, oldArrNext);
2950     __ vsrl_vx(v0, v0, shiftCount);
2951     __ vsll_vx(v4, v4, shiftRevCount);
2952     __ vor_vv(v0, v0, v4);
2953     __ vse32_v(v0, newArrCur);
2954     __ bnez(idx, loop);
2955 
2956     __ bind(exit);
2957     __ ret();
2958 
2959     return entry;
2960   }
2961 #endif
2962 
2963 #ifdef COMPILER2
2964   class MontgomeryMultiplyGenerator : public MacroAssembler {
2965 
2966     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
2967       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
2968 
2969     RegSet _toSave;
2970     bool _squaring;
2971 
2972   public:
2973     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
2974       : MacroAssembler(as->code()), _squaring(squaring) {
2975 
2976       // Register allocation
2977 
2978       Register reg = c_rarg0;
2979       Pa_base = reg;       // Argument registers
2980       if (squaring) {
2981         Pb_base = Pa_base;
2982       } else {
2983         Pb_base = ++reg;
2984       }
2985       Pn_base = ++reg;
2986       Rlen= ++reg;
2987       inv = ++reg;
2988       Pm_base = ++reg;
2989 
2990                         // Working registers:
2991       Ra =  ++reg;      // The current digit of a, b, n, and m.
2992       Rb =  ++reg;
2993       Rm =  ++reg;
2994       Rn =  ++reg;
2995 
2996       Pa =  ++reg;      // Pointers to the current/next digit of a, b, n, and m.
2997       Pb =  ++reg;
2998       Pm =  ++reg;
2999       Pn =  ++reg;
3000 
3001       tmp0 =  ++reg;    // Three registers which form a
3002       tmp1 =  ++reg;    // triple-precision accumulator.
3003       tmp2 =  ++reg;
3004 
3005       Ri =  x6;         // Inner and outer loop indexes.
3006       Rj =  x7;
3007 
3008       Rhi_ab = x28;     // Product registers: low and high parts
3009       Rlo_ab = x29;     // of a*b and m*n.
3010       Rhi_mn = x30;
3011       Rlo_mn = x31;
3012 
3013       // x18 and up are callee-saved.
3014       _toSave = RegSet::range(x18, reg) + Pm_base;
3015     }
3016 
3017   private:
3018     void save_regs() {
3019       push_reg(_toSave, sp);
3020     }
3021 
3022     void restore_regs() {
3023       pop_reg(_toSave, sp);
3024     }
3025 
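         // Run 'block' count times using a loop unrolled by two: the block is
         // emitted twice, and an odd count first branches to the second copy
         // ("odd") so a single block executes before the two-per-iteration loop.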
3026     template <typename T>
3027     void unroll_2(Register count, T block) {
3028       Label loop, end, odd;
3029       beqz(count, end);
3030       andi(t0, count, 0x1);
3031       bnez(t0, odd);
3032       align(16);
3033       bind(loop);
3034       (this->*block)();
3035       bind(odd);
3036       (this->*block)();
3037       addi(count, count, -2);
3038       bgtz(count, loop);
3039       bind(end);
3040     }
3041 
3042     template <typename T>
3043     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3044       Label loop, end, odd;
3045       beqz(count, end);
3046       andi(tmp, count, 0x1);
3047       bnez(tmp, odd);
3048       align(16);
3049       bind(loop);
3050       (this->*block)(d, s, tmp);
3051       bind(odd);
3052       (this->*block)(d, s, tmp);
3053       addi(count, count, -2);
3054       bgtz(count, loop);
3055       bind(end);
3056     }
3057 
3058     void pre1(RegisterOrConstant i) {
3059       block_comment("pre1");
3060       // Pa = Pa_base;
3061       // Pb = Pb_base + i;
3062       // Pm = Pm_base;
3063       // Pn = Pn_base + i;
3064       // Ra = *Pa;
3065       // Rb = *Pb;
3066       // Rm = *Pm;
3067       // Rn = *Pn;
3068       if (i.is_register()) {
3069         slli(t0, i.as_register(), LogBytesPerWord);
3070       } else {
3071         mv(t0, i.as_constant());
3072         slli(t0, t0, LogBytesPerWord);
3073       }
3074 
3075       mv(Pa, Pa_base);
3076       add(Pb, Pb_base, t0);
3077       mv(Pm, Pm_base);
3078       add(Pn, Pn_base, t0);
3079 
3080       ld(Ra, Address(Pa));
3081       ld(Rb, Address(Pb));
3082       ld(Rm, Address(Pm));
3083       ld(Rn, Address(Pn));
3084 
3085       // Zero the m*n result.
3086       mv(Rhi_mn, zr);
3087       mv(Rlo_mn, zr);
3088     }
3089 
3090     // The core multiply-accumulate step of a Montgomery
3091     // multiplication.  The idea is to schedule operations as a
3092     // pipeline so that instructions with long latencies (loads and
3093     // multiplies) have time to complete before their results are
3094     // used.  This most benefits in-order implementations of the
3095     // architecture but out-of-order ones also benefit.
3096     void step() {
3097       block_comment("step");
3098       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3099       // Ra = *++Pa;
3100       // Rb = *--Pb;
3101       mulhu(Rhi_ab, Ra, Rb);
3102       mul(Rlo_ab, Ra, Rb);
3103       addi(Pa, Pa, wordSize);
3104       ld(Ra, Address(Pa));
3105       addi(Pb, Pb, -wordSize);
3106       ld(Rb, Address(Pb));
3107       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
3108                                             // previous iteration.
3109       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3110       // Rm = *++Pm;
3111       // Rn = *--Pn;
3112       mulhu(Rhi_mn, Rm, Rn);
3113       mul(Rlo_mn, Rm, Rn);
3114       addi(Pm, Pm, wordSize);
3115       ld(Rm, Address(Pm));
3116       addi(Pn, Pn, -wordSize);
3117       ld(Rn, Address(Pn));
3118       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3119     }
3120 
3121     void post1() {
3122       block_comment("post1");
3123 
3124       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3125       // Ra = *++Pa;
3126       // Rb = *--Pb;
3127       mulhu(Rhi_ab, Ra, Rb);
3128       mul(Rlo_ab, Ra, Rb);
3129       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3130       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3131 
3132       // *Pm = Rm = tmp0 * inv;
3133       mul(Rm, tmp0, inv);
3134       sd(Rm, Address(Pm));
3135 
3136       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3137       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3138       mulhu(Rhi_mn, Rm, Rn);
3139 
3140 #ifndef PRODUCT
3141       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3142       {
3143         mul(Rlo_mn, Rm, Rn);
3144         add(Rlo_mn, tmp0, Rlo_mn);
3145         Label ok;
3146         beqz(Rlo_mn, ok);
3147         stop("broken Montgomery multiply");
3148         bind(ok);
3149       }
3150 #endif
3151       // We have very carefully set things up so that
3152       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3153       // the lower half of Rm * Rn because we know the result already:
3154       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3155       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3156       // the carry flag iff tmp0 is nonzero.
3157       //
3158       // mul(Rlo_mn, Rm, Rn);
3159       // cad(zr, tmp0, Rlo_mn);
3160       addi(t0, tmp0, -1);
3161       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3162       cadc(tmp0, tmp1, Rhi_mn, t0);
3163       adc(tmp1, tmp2, zr, t0);
3164       mv(tmp2, zr);
3165     }
3166 
3167     void pre2(Register i, Register len) {
3168       block_comment("pre2");
3169       // Pa = Pa_base + i-len;
3170       // Pb = Pb_base + len;
3171       // Pm = Pm_base + i-len;
3172       // Pn = Pn_base + len;
3173 
3174       sub(Rj, i, len);
3175       // Rj == i-len
3176 
3177       // Ra as temp register
3178       slli(Ra, Rj, LogBytesPerWord);
3179       add(Pa, Pa_base, Ra);
3180       add(Pm, Pm_base, Ra);
3181       slli(Ra, len, LogBytesPerWord);
3182       add(Pb, Pb_base, Ra);
3183       add(Pn, Pn_base, Ra);
3184 
3185       // Ra = *++Pa;
3186       // Rb = *--Pb;
3187       // Rm = *++Pm;
3188       // Rn = *--Pn;
3189       add(Pa, Pa, wordSize);
3190       ld(Ra, Address(Pa));
3191       add(Pb, Pb, -wordSize);
3192       ld(Rb, Address(Pb));
3193       add(Pm, Pm, wordSize);
3194       ld(Rm, Address(Pm));
3195       add(Pn, Pn, -wordSize);
3196       ld(Rn, Address(Pn));
3197 
3198       mv(Rhi_mn, zr);
3199       mv(Rlo_mn, zr);
3200     }
3201 
3202     void post2(Register i, Register len) {
3203       block_comment("post2");
3204       sub(Rj, i, len);
3205 
3206       cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
3207 
3208       // As soon as we know the least significant digit of our result,
3209       // store it.
3210       // Pm_base[i-len] = tmp0;
3211       // Rj as temp register
3212       slli(Rj, Rj, LogBytesPerWord);
3213       add(Rj, Pm_base, Rj);
3214       sd(tmp0, Address(Rj));
3215 
3216       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3217       cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
3218       adc(tmp1, tmp2, zr, t0);
3219       mv(tmp2, zr);
3220     }
3221 
3222     // A carry in tmp0 after Montgomery multiplication means that we
3223     // should subtract multiples of n from our result in m.  We'll
3224     // keep doing that until there is no carry.
3225     void normalize(Register len) {
3226       block_comment("normalize");
3227       // while (tmp0)
3228       //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
3229       Label loop, post, again;
3230       Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
3231       beqz(tmp0, post); {
3232         bind(again); {
3233           mv(i, zr);
3234           mv(cnt, len);
3235           slli(Rn, i, LogBytesPerWord);
3236           add(Rm, Pm_base, Rn);
3237           ld(Rm, Address(Rm));
3238           add(Rn, Pn_base, Rn);
3239           ld(Rn, Address(Rn));
3240           li(t0, 1); // set carry flag, i.e. no borrow
3241           align(16);
3242           bind(loop); {
3243             notr(Rn, Rn);
3244             add(Rm, Rm, t0);
3245             add(Rm, Rm, Rn);
3246             sltu(t0, Rm, Rn);
3247             slli(Rn, i, LogBytesPerWord); // Rn as temp register
3248             add(Rn, Pm_base, Rn);
3249             sd(Rm, Address(Rn));
3250             add(i, i, 1);
3251             slli(Rn, i, LogBytesPerWord);
3252             add(Rm, Pm_base, Rn);
3253             ld(Rm, Address(Rm));
3254             add(Rn, Pn_base, Rn);
3255             ld(Rn, Address(Rn));
3256             sub(cnt, cnt, 1);
3257           } bnez(cnt, loop);
3258           addi(tmp0, tmp0, -1);
3259           add(tmp0, tmp0, t0);
3260         } bnez(tmp0, again);
3261       } bind(post);
3262     }
3263 
3264     // Move memory at s to d, reversing words.
3265     //    Increments d to end of copied memory
3266     //    Destroys tmp1, tmp2
3267     //    Preserves len
3268     //    Leaves s pointing to the address which was in d at start
3269     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3270       assert(tmp1 < x28 && tmp2 < x28, "register corruption");
3271 
3272       slli(tmp1, len, LogBytesPerWord);
3273       add(s, s, tmp1);
3274       mv(tmp1, len);
3275       unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3276       slli(tmp1, len, LogBytesPerWord);
3277       sub(s, d, tmp1);
3278     }
3279     // [63...0] -> [31...0][63...32]
3280     void reverse1(Register d, Register s, Register tmp) {
3281       addi(s, s, -wordSize);
3282       ld(tmp, Address(s));
3283       ror_imm(tmp, tmp, 32, t0);
3284       sd(tmp, Address(d));
3285       addi(d, d, wordSize);
3286     }
3287 
3288     void step_squaring() {
3289       // An extra ACC
3290       step();
3291       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3292     }
3293 
3294     void last_squaring(Register i) {
3295       Label dont;
3296       // if ((i & 1) == 0) {
3297       andi(t0, i, 0x1);
3298       bnez(t0, dont); {
3299         // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3300         // Ra = *++Pa;
3301         // Rb = *--Pb;
3302         mulhu(Rhi_ab, Ra, Rb);
3303         mul(Rlo_ab, Ra, Rb);
3304         acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3305       } bind(dont);
3306     }
3307 
3308     void extra_step_squaring() {
3309       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3310 
3311       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3312       // Rm = *++Pm;
3313       // Rn = *--Pn;
3314       mulhu(Rhi_mn, Rm, Rn);
3315       mul(Rlo_mn, Rm, Rn);
3316       addi(Pm, Pm, wordSize);
3317       ld(Rm, Address(Pm));
3318       addi(Pn, Pn, -wordSize);
3319       ld(Rn, Address(Pn));
3320     }
3321 
3322     void post1_squaring() {
3323       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3324 
3325       // *Pm = Rm = tmp0 * inv;
3326       mul(Rm, tmp0, inv);
3327       sd(Rm, Address(Pm));
3328 
3329       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3330       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3331       mulhu(Rhi_mn, Rm, Rn);
3332 
3333 #ifndef PRODUCT
3334       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3335       {
3336         mul(Rlo_mn, Rm, Rn);
3337         add(Rlo_mn, tmp0, Rlo_mn);
3338         Label ok;
3339         beqz(Rlo_mn, ok); {
3340           stop("broken Montgomery multiply");
3341         } bind(ok);
3342       }
3343 #endif
3344       // We have very carefully set things up so that
3345       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3346       // the lower half of Rm * Rn because we know the result already:
3347       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3348       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3349       // the carry flag iff tmp0 is nonzero.
3350       //
3351       // mul(Rlo_mn, Rm, Rn);
3352       // cad(zr, tmp0, Rlo_mn);
3353       addi(t0, tmp0, -1);
3354       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3355       cadc(tmp0, tmp1, Rhi_mn, t0);
3356       adc(tmp1, tmp2, zr, t0);
3357       mv(tmp2, zr);
3358     }
3359 
3360     // Accumulate Rhi:Rlo into the triple-precision accumulator tmp0:tmp1:tmp2; t0 is used as the carry
3361     void acc(Register Rhi, Register Rlo,
3362              Register tmp0, Register tmp1, Register tmp2) {
3363       cad(tmp0, tmp0, Rlo, t0);
3364       cadc(tmp1, tmp1, Rhi, t0);
3365       adc(tmp2, tmp2, zr, t0);
3366     }
3367 
3368   public:
3369     /**
3370      * Fast Montgomery multiplication.  The derivation of the
3371      * algorithm is in A Cryptographic Library for the Motorola
3372      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3373      *
3374      * Arguments:
3375      *
3376      * Inputs for multiplication:
3377      *   c_rarg0   - int array elements a
3378      *   c_rarg1   - int array elements b
3379      *   c_rarg2   - int array elements n (the modulus)
3380      *   c_rarg3   - int length
3381      *   c_rarg4   - int inv
3382      *   c_rarg5   - int array elements m (the result)
3383      *
3384      * Inputs for squaring:
3385      *   c_rarg0   - int array elements a
3386      *   c_rarg1   - int array elements n (the modulus)
3387      *   c_rarg2   - int length
3388      *   c_rarg3   - int inv
3389      *   c_rarg4   - int array elements m (the result)
3390      *
3391      */
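         // A rough C-level sketch of what the generated code computes (illustrative
         // only; MACC(x, y) accumulates the double-word product x*y into the
         // triple-precision accumulator tmp0:tmp1:tmp2, and a, b, n, m denote the
         // word-reversed working copies set up below):
         //
         //   for (i = 0; i < len; i++) {          // pre1/step/post1
         //     for (j = i; j; j--)
         //       { MACC(a, b); MACC(m, n); }
         //     MACC(a, b);
         //     m[i] = tmp0 * inv;
         //     MACC(m[i], n[0]);                  // makes the low word zero
         //     tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
         //   }
         //   for (i = len; i < 2*len; i++) {      // pre2/step/post2
         //     for (j = 2*len-i-1; j; j--)
         //       { MACC(a, b); MACC(m, n); }
         //     m[i-len] = tmp0;
         //     tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
         //   }
         //   while (tmp0 != 0)                    // normalize
         //     tmp0 = sub(m, n, tmp0, len);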
3392     address generate_multiply() {
3393       Label argh, nothing;
3394       bind(argh);
3395       stop("MontgomeryMultiply total_allocation must be <= 8192");
3396 
3397       align(CodeEntryAlignment);
3398       address entry = pc();
3399 
3400       beqz(Rlen, nothing);
3401 
3402       enter();
3403 
3404       // Make room.
3405       li(Ra, 512);
3406       bgt(Rlen, Ra, argh);
3407       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3408       sub(Ra, sp, Ra);
3409       andi(sp, Ra, -2 * wordSize);
3410 
3411       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3412 
3413       {
3414         // Copy input args, reversing as we go.  We use Ra as a
3415         // temporary variable.
3416         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3417         if (!_squaring)
3418           reverse(Ra, Pb_base, Rlen, Ri, Rj);
3419         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3420       }
3421 
3422       // Push all call-saved registers and also Pm_base which we'll need
3423       // at the end.
3424       save_regs();
3425 
3426 #ifndef PRODUCT
3427       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3428       {
3429         ld(Rn, Address(Pn_base));
3430         mul(Rlo_mn, Rn, inv);
3431         li(t0, -1);
3432         Label ok;
3433         beq(Rlo_mn, t0, ok);
3434         stop("broken inverse in Montgomery multiply");
3435         bind(ok);
3436       }
3437 #endif
3438 
3439       mv(Pm_base, Ra);
3440 
3441       mv(tmp0, zr);
3442       mv(tmp1, zr);
3443       mv(tmp2, zr);
3444 
3445       block_comment("for (int i = 0; i < len; i++) {");
3446       mv(Ri, zr); {
3447         Label loop, end;
3448         bge(Ri, Rlen, end);
3449 
3450         bind(loop);
3451         pre1(Ri);
3452 
3453         block_comment("  for (j = i; j; j--) {"); {
3454           mv(Rj, Ri);
3455           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3456         } block_comment("  } // j");
3457 
3458         post1();
3459         addw(Ri, Ri, 1);
3460         blt(Ri, Rlen, loop);
3461         bind(end);
3462         block_comment("} // i");
3463       }
3464 
3465       block_comment("for (int i = len; i < 2*len; i++) {");
3466       mv(Ri, Rlen); {
3467         Label loop, end;
3468         slli(t0, Rlen, 1);
3469         bge(Ri, t0, end);
3470 
3471         bind(loop);
3472         pre2(Ri, Rlen);
3473 
3474         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3475           slliw(Rj, Rlen, 1);
3476           subw(Rj, Rj, Ri);
3477           subw(Rj, Rj, 1);
3478           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3479         } block_comment("  } // j");
3480 
3481         post2(Ri, Rlen);
3482         addw(Ri, Ri, 1);
3483         slli(t0, Rlen, 1);
3484         blt(Ri, t0, loop);
3485         bind(end);
3486       }
3487       block_comment("} // i");
3488 
3489       normalize(Rlen);
3490 
3491       mv(Ra, Pm_base);  // Save Pm_base in Ra
3492       restore_regs();  // Restore caller's Pm_base
3493 
3494       // Copy our result into caller's Pm_base
3495       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3496 
3497       leave();
3498       bind(nothing);
3499       ret();
3500 
3501       return entry;
3502     }
3503 
3504     /**
3505      *
3506      * Arguments:
3507      *
3508      * Inputs:
3509      *   c_rarg0   - int array elements a
3510      *   c_rarg1   - int array elements n (the modulus)
3511      *   c_rarg2   - int length
3512      *   c_rarg3   - int inv
3513      *   c_rarg4   - int array elements m (the result)
3514      *
3515      */
3516     address generate_square() {
3517       Label argh;
3518       bind(argh);
3519       stop("MontgomeryMultiply total_allocation must be <= 8192");
3520 
3521       align(CodeEntryAlignment);
3522       address entry = pc();
3523 
3524       enter();
3525 
3526       // Make room.
3527       li(Ra, 512);
3528       bgt(Rlen, Ra, argh);
3529       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3530       sub(Ra, sp, Ra);
3531       andi(sp, Ra, -2 * wordSize);
3532 
3533       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3534 
3535       {
3536         // Copy input args, reversing as we go.  We use Ra as a
3537         // temporary variable.
3538         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3539         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3540       }
3541 
3542       // Push all call-saved registers and also Pm_base which we'll need
3543       // at the end.
3544       save_regs();
3545 
3546       mv(Pm_base, Ra);
3547 
3548       mv(tmp0, zr);
3549       mv(tmp1, zr);
3550       mv(tmp2, zr);
3551 
3552       block_comment("for (int i = 0; i < len; i++) {");
3553       mv(Ri, zr); {
3554         Label loop, end;
3555         bind(loop);
3556         bge(Ri, Rlen, end);
3557 
3558         pre1(Ri);
3559 
3560         block_comment("for (j = (i+1)/2; j; j--) {"); {
3561           addi(Rj, Ri, 1);
3562           srliw(Rj, Rj, 1);
3563           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3564         } block_comment("  } // j");
3565 
3566         last_squaring(Ri);
3567 
3568         block_comment("  for (j = i/2; j; j--) {"); {
3569           srliw(Rj, Ri, 1);
3570           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3571         } block_comment("  } // j");
3572 
3573         post1_squaring();
3574         addi(Ri, Ri, 1);
3575         blt(Ri, Rlen, loop);
3576 
3577         bind(end);
3578         block_comment("} // i");
3579       }
3580 
3581       block_comment("for (int i = len; i < 2*len; i++) {");
3582       mv(Ri, Rlen); {
3583         Label loop, end;
3584         bind(loop);
3585         slli(t0, Rlen, 1);
3586         bge(Ri, t0, end);
3587 
3588         pre2(Ri, Rlen);
3589 
3590         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
3591           slli(Rj, Rlen, 1);
3592           sub(Rj, Rj, Ri);
3593           sub(Rj, Rj, 1);
3594           srliw(Rj, Rj, 1);
3595           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3596         } block_comment("  } // j");
3597 
3598         last_squaring(Ri);
3599 
3600         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
3601           slli(Rj, Rlen, 1);
3602           sub(Rj, Rj, Ri);
3603           srliw(Rj, Rj, 1);
3604           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3605         } block_comment("  } // j");
3606 
3607         post2(Ri, Rlen);
3608         addi(Ri, Ri, 1);
3609         slli(t0, Rlen, 1);
3610         blt(Ri, t0, loop);
3611 
3612         bind(end);
3613         block_comment("} // i");
3614       }
3615 
3616       normalize(Rlen);
3617 
3618       mv(Ra, Pm_base);  // Save Pm_base in Ra
3619       restore_regs();  // Restore caller's Pm_base
3620 
3621       // Copy our result into caller's Pm_base
3622       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3623 
3624       leave();
3625       ret();
3626 
3627       return entry;
3628     }
3629   };
3630 #endif // COMPILER2
3631 
3632   // Continuation point for throwing of implicit exceptions that are
3633   // not handled in the current activation. Fabricates an exception
3634   // oop and initiates normal exception dispatching in this
3635   // frame. Since we need to preserve callee-saved values (currently
3636   // only for C2, but done for C1 as well) we need a callee-saved oop
3637   // map and therefore have to make these stubs into RuntimeStubs
3638   // rather than BufferBlobs.  If the compiler needs all registers to
3639   // be preserved between the fault point and the exception handler
3640   // then it must assume responsibility for that in
3641   // AbstractCompiler::continuation_for_implicit_null_exception or
3642   // continuation_for_implicit_division_by_zero_exception. All other
3643   // implicit exceptions (e.g., NullPointerException or
3644   // AbstractMethodError on entry) are either at call sites or
3645   // otherwise assume that stack unwinding will be initiated, so
3646   // caller saved registers were assumed volatile in the compiler.
3647 
3648 #undef __
3649 #define __ masm->
3650 
3651   address generate_throw_exception(const char* name,
3652                                    address runtime_entry,
3653                                    Register arg1 = noreg,
3654                                    Register arg2 = noreg) {
3655     // Information about frame layout at time of blocking runtime call.
3656     // Note that we only have to preserve callee-saved registers since
3657     // the compilers are responsible for supplying a continuation point
3658     // if they expect all registers to be preserved.
3659     // n.b. riscv asserts that frame::arg_reg_save_area_bytes == 0
3660     assert_cond(runtime_entry != NULL);
3661     enum layout {
3662       fp_off = 0,
3663       fp_off2,
3664       return_off,
3665       return_off2,
3666       framesize // inclusive of return address
3667     };
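         // Layout slots are 32-bit (VMRegImpl::slot_size), so the saved fp and the
         // return address take two slots each; framesize is 4 slots == 16 bytes.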
3668 
3669     const int insts_size = 512;
3670     const int locs_size  = 64;
3671 
3672     CodeBuffer code(name, insts_size, locs_size);
3673     OopMapSet* oop_maps  = new OopMapSet();
3674     MacroAssembler* masm = new MacroAssembler(&code);
3675     assert_cond(oop_maps != NULL && masm != NULL);
3676 
3677     address start = __ pc();
3678 
3679     // This is an inlined and slightly modified version of call_VM
3680     // which has the ability to fetch the return PC out of
3681     // thread-local storage and also sets up last_Java_sp slightly
3682     // differently than the real call_VM
3683 
3684     __ enter(); // Save FP and RA before call
3685 
3686     assert(is_even(framesize / 2), "sp not 16-byte aligned");
3687 
3688     // ra and fp are already in place
3689     __ addi(sp, fp, 0 - ((unsigned)framesize << LogBytesPerInt)); // prolog
3690 
3691     int frame_complete = __ pc() - start;
3692 
3693     // Set up last_Java_sp and last_Java_fp
3694     address the_pc = __ pc();
3695     __ set_last_Java_frame(sp, fp, the_pc, t0);
3696 
3697     // Call runtime
3698     if (arg1 != noreg) {
3699       assert(arg2 != c_rarg1, "clobbered");
3700       __ mv(c_rarg1, arg1);
3701     }
3702     if (arg2 != noreg) {
3703       __ mv(c_rarg2, arg2);
3704     }
3705     __ mv(c_rarg0, xthread);
3706     BLOCK_COMMENT("call runtime_entry");
3707     int32_t offset = 0;
3708     __ movptr_with_offset(t0, runtime_entry, offset);
3709     __ jalr(x1, t0, offset);
3710 
3711     // Generate oop map
3712     OopMap* map = new OopMap(framesize, 0);
3713     assert_cond(map != NULL);
3714 
3715     oop_maps->add_gc_map(the_pc - start, map);
3716 
3717     __ reset_last_Java_frame(true);
3718 
3719     __ leave();
3720 
3721     // check for pending exceptions
3722 #ifdef ASSERT
3723     Label L;
3724     __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
3725     __ bnez(t0, L);
3726     __ should_not_reach_here();
3727     __ bind(L);
3728 #endif // ASSERT
3729     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3730 
3732     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3733     RuntimeStub* stub =
3734       RuntimeStub::new_runtime_stub(name,
3735                                     &code,
3736                                     frame_complete,
3737                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3738                                     oop_maps, false);
3739     assert(stub != NULL, "create runtime stub fail!");
3740     return stub->entry_point();
3741   }
3742 
3743   // Initialization
3744   void generate_initial() {
3745     // Generates the initial stubs and initializes the entry points.
3746 
3747     // Entry points that exist on all platforms. Note: this is code
3748     // that could be shared among different platforms; however, the
3749     // benefit seems to be smaller than the disadvantage of having a
3750     // much more complicated generator structure. See also the comment
3751     // in stubRoutines.hpp.
3752 
3753     StubRoutines::_forward_exception_entry = generate_forward_exception();
3754 
3755     StubRoutines::_call_stub_entry =
3756       generate_call_stub(StubRoutines::_call_stub_return_address);
3757 
3758     // is referenced by megamorphic call
3759     StubRoutines::_catch_exception_entry = generate_catch_exception();
3760 
3761     // Build this early so it's available for the interpreter.
3762     StubRoutines::_throw_StackOverflowError_entry =
3763       generate_throw_exception("StackOverflowError throw_exception",
3764                                CAST_FROM_FN_PTR(address,
3765                                                 SharedRuntime::throw_StackOverflowError));
3766     StubRoutines::_throw_delayed_StackOverflowError_entry =
3767       generate_throw_exception("delayed StackOverflowError throw_exception",
3768                                CAST_FROM_FN_PTR(address,
3769                                                 SharedRuntime::throw_delayed_StackOverflowError));
3770     // Safefetch stubs.
3771     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
3772                                                        &StubRoutines::_safefetch32_fault_pc,
3773                                                        &StubRoutines::_safefetch32_continuation_pc);
3774     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
3775                                                        &StubRoutines::_safefetchN_fault_pc,
3776                                                        &StubRoutines::_safefetchN_continuation_pc);
3777   }
3778 
3779   void generate_all() {
3780     // support for verify_oop (must happen after universe_init)
3781     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
3782     StubRoutines::_throw_AbstractMethodError_entry =
3783       generate_throw_exception("AbstractMethodError throw_exception",
3784                                CAST_FROM_FN_PTR(address,
3785                                                 SharedRuntime::
3786                                                 throw_AbstractMethodError));
3787 
3788     StubRoutines::_throw_IncompatibleClassChangeError_entry =
3789       generate_throw_exception("IncompatibleClassChangeError throw_exception",
3790                                CAST_FROM_FN_PTR(address,
3791                                                 SharedRuntime::
3792                                                 throw_IncompatibleClassChangeError));
3793 
3794     StubRoutines::_throw_NullPointerException_at_call_entry =
3795       generate_throw_exception("NullPointerException at call throw_exception",
3796                                CAST_FROM_FN_PTR(address,
3797                                                 SharedRuntime::
3798                                                 throw_NullPointerException_at_call));
3799     // arraycopy stubs used by compilers
3800     generate_arraycopy_stubs();
3801 
3802 #ifdef COMPILER2
3803     if (UseMulAddIntrinsic) {
3804       StubRoutines::_mulAdd = generate_mulAdd();
3805     }
3806 
3807     if (UseMultiplyToLenIntrinsic) {
3808       StubRoutines::_multiplyToLen = generate_multiplyToLen();
3809     }
3810 
3811     if (UseSquareToLenIntrinsic) {
3812       StubRoutines::_squareToLen = generate_squareToLen();
3813     }
3814 
3815     if (UseMontgomeryMultiplyIntrinsic) {
3816       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
3817       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
3818       StubRoutines::_montgomeryMultiply = g.generate_multiply();
3819     }
3820 
3821     if (UseMontgomerySquareIntrinsic) {
3822       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
3823       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
3824       StubRoutines::_montgomerySquare = g.generate_square();
3825     }
3826 
3827     if (UseRVVForBigIntegerShiftIntrinsics) {
3828       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
3829       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
3830     }
3831 #endif
3832 
3833     generate_compare_long_strings();
3834 
3835     generate_string_indexof_stubs();
3836 
3837     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
3838     if (bs_nm != NULL) {
3839       StubRoutines::riscv::_method_entry_barrier = generate_method_entry_barrier();
3840     }
3841 
3842     StubRoutines::riscv::set_completed();
3843   }
3844 
3845  public:
3846   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3847     if (all) {
3848       generate_all();
3849     } else {
3850       generate_initial();
3851     }
3852   }
3853 
3854   ~StubGenerator() {}
3855 }; // end class declaration
3856 
3857 #define UCM_TABLE_MAX_ENTRIES 8
3858 void StubGenerator_generate(CodeBuffer* code, bool all) {
3859   if (UnsafeCopyMemory::_table == NULL) {
3860     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
3861   }
3862 
3863   StubGenerator g(code, all);
3864 }