1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/universe.hpp"
  35 #include "nativeInst_riscv.hpp"
  36 #include "oops/instanceOop.hpp"
  37 #include "oops/method.hpp"
  38 #include "oops/objArrayKlass.hpp"
  39 #include "oops/oop.inline.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "prims/upcallLinker.hpp"
  42 #include "runtime/continuation.hpp"
  43 #include "runtime/continuationEntry.inline.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/javaThread.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/runtime.hpp"
  54 #endif
  55 
  56 // Declaration and definition of StubGenerator (no .hpp file).
  57 // For a more detailed description of the stub routine structure
  58 // see the comment in stubRoutines.hpp
  59 
  60 #undef __
  61 #define __ _masm->
  62 
  63 #ifdef PRODUCT
  64 #define BLOCK_COMMENT(str) /* nothing */
  65 #else
  66 #define BLOCK_COMMENT(str) __ block_comment(str)
  67 #endif
  68 
  69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  70 
  71 // Stub Code definitions
  72 
  73 class StubGenerator: public StubCodeGenerator {
  74  private:
  75 
  76 #ifdef PRODUCT
  77 #define inc_counter_np(counter) ((void)0)
  78 #else
  79   void inc_counter_np_(uint& counter) {
  80     __ incrementw(ExternalAddress((address)&counter));
  81   }
  82 #define inc_counter_np(counter) \
  83   BLOCK_COMMENT("inc_counter " #counter); \
  84   inc_counter_np_(counter);
  85 #endif
  86 
  87   // Call stubs are used to call Java from C
  88   //
  89   // Arguments:
  90   //    c_rarg0:   call wrapper address                   address
  91   //    c_rarg1:   result                                 address
  92   //    c_rarg2:   result type                            BasicType
  93   //    c_rarg3:   method                                 Method*
  94   //    c_rarg4:   (interpreter) entry point              address
  95   //    c_rarg5:   parameters                             intptr_t*
  96   //    c_rarg6:   parameter size (in words)              int
  97   //    c_rarg7:   thread                                 Thread*
  98   //
  99   // There is no return from the stub itself as any Java result
 100   // is written to result
 101   //
  // we save x1 (ra) as the return PC at the base of the frame and
  // link x8 (fp) below it as the frame pointer, installing sp (x2)
  // into fp.
  //
  // we save x10-x17, which accounts for all the C arguments.
 107   //
 108   // TODO: strictly do we need to save them all? they are treated as
 109   // volatile by C so could we omit saving the ones we are going to
 110   // place in global registers (thread? method?) or those we only use
 111   // during setup of the Java call?
 112   //
 113   // we don't need to save x5 which C uses as an indirect result location
 114   // return register.
 115   //
 116   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 117   // volatile
 118   //
 119   // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
 120   // registers and C expects to be callee-save
 121   //
 122   // so the stub frame looks like this when we enter Java code
 123   //
 124   //     [ return_from_Java     ] <--- sp
 125   //     [ argument word n      ]
 126   //      ...
 127   // -35 [ argument word 1      ]
 128   // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
 129   // -33 [ saved f27            ]
 130   // -32 [ saved f26            ]
 131   // -31 [ saved f25            ]
 132   // -30 [ saved f24            ]
 133   // -29 [ saved f23            ]
 134   // -28 [ saved f22            ]
 135   // -27 [ saved f21            ]
 136   // -26 [ saved f20            ]
 137   // -25 [ saved f19            ]
 138   // -24 [ saved f18            ]
 139   // -23 [ saved f9             ]
 140   // -22 [ saved f8             ]
 141   // -21 [ saved x27            ]
 142   // -20 [ saved x26            ]
 143   // -19 [ saved x25            ]
 144   // -18 [ saved x24            ]
 145   // -17 [ saved x23            ]
 146   // -16 [ saved x22            ]
 147   // -15 [ saved x21            ]
 148   // -14 [ saved x20            ]
 149   // -13 [ saved x19            ]
 150   // -12 [ saved x18            ]
 151   // -11 [ saved x9             ]
 152   // -10 [ call wrapper   (x10) ]
 153   //  -9 [ result         (x11) ]
 154   //  -8 [ result type    (x12) ]
 155   //  -7 [ method         (x13) ]
 156   //  -6 [ entry point    (x14) ]
 157   //  -5 [ parameters     (x15) ]
 158   //  -4 [ parameter size (x16) ]
 159   //  -3 [ thread         (x17) ]
 160   //  -2 [ saved fp       (x8)  ]
 161   //  -1 [ saved ra       (x1)  ]
 162   //   0 [                      ] <--- fp == saved sp (x2)
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off  = -34,
 167 
 168     frm_off            = sp_after_call_off,
 169     f27_off            = -33,
 170     f26_off            = -32,
 171     f25_off            = -31,
 172     f24_off            = -30,
 173     f23_off            = -29,
 174     f22_off            = -28,
 175     f21_off            = -27,
 176     f20_off            = -26,
 177     f19_off            = -25,
 178     f18_off            = -24,
 179     f9_off             = -23,
 180     f8_off             = -22,
 181 
 182     x27_off            = -21,
 183     x26_off            = -20,
 184     x25_off            = -19,
 185     x24_off            = -18,
 186     x23_off            = -17,
 187     x22_off            = -16,
 188     x21_off            = -15,
 189     x20_off            = -14,
 190     x19_off            = -13,
 191     x18_off            = -12,
 192     x9_off             = -11,
 193 
 194     call_wrapper_off   = -10,
 195     result_off         = -9,
 196     result_type_off    = -8,
 197     method_off         = -7,
 198     entry_point_off    = -6,
 199     parameters_off     = -5,
 200     parameter_size_off = -4,
 201     thread_off         = -3,
 202     fp_f               = -2,
 203     retaddr_off        = -1,
 204   };
 205 
 206   address generate_call_stub(address& return_address) {
 207     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 208            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 209            "adjust this code");
 210 
 211     StubCodeMark mark(this, "StubRoutines", "call_stub");
 212     address start = __ pc();
 213 
 214     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 215 
 216     const Address frm_save      (fp, frm_off           * wordSize);
 217     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 218     const Address result        (fp, result_off         * wordSize);
 219     const Address result_type   (fp, result_type_off    * wordSize);
 220     const Address method        (fp, method_off         * wordSize);
 221     const Address entry_point   (fp, entry_point_off    * wordSize);
 222     const Address parameters    (fp, parameters_off     * wordSize);
 223     const Address parameter_size(fp, parameter_size_off * wordSize);
 224 
 225     const Address thread        (fp, thread_off         * wordSize);
 226 
 227     const Address f27_save      (fp, f27_off            * wordSize);
 228     const Address f26_save      (fp, f26_off            * wordSize);
 229     const Address f25_save      (fp, f25_off            * wordSize);
 230     const Address f24_save      (fp, f24_off            * wordSize);
 231     const Address f23_save      (fp, f23_off            * wordSize);
 232     const Address f22_save      (fp, f22_off            * wordSize);
 233     const Address f21_save      (fp, f21_off            * wordSize);
 234     const Address f20_save      (fp, f20_off            * wordSize);
 235     const Address f19_save      (fp, f19_off            * wordSize);
 236     const Address f18_save      (fp, f18_off            * wordSize);
 237     const Address f9_save       (fp, f9_off             * wordSize);
 238     const Address f8_save       (fp, f8_off             * wordSize);
 239 
 240     const Address x27_save      (fp, x27_off            * wordSize);
 241     const Address x26_save      (fp, x26_off            * wordSize);
 242     const Address x25_save      (fp, x25_off            * wordSize);
 243     const Address x24_save      (fp, x24_off            * wordSize);
 244     const Address x23_save      (fp, x23_off            * wordSize);
 245     const Address x22_save      (fp, x22_off            * wordSize);
 246     const Address x21_save      (fp, x21_off            * wordSize);
 247     const Address x20_save      (fp, x20_off            * wordSize);
 248     const Address x19_save      (fp, x19_off            * wordSize);
 249     const Address x18_save      (fp, x18_off            * wordSize);
 250 
 251     const Address x9_save       (fp, x9_off             * wordSize);
 252 
 253     // stub code
 254 
 255     address riscv_entry = __ pc();
 256 
 257     // set up frame and move sp to end of save area
 258     __ enter();
 259     __ addi(sp, fp, sp_after_call_off * wordSize);
 260 
 261     // save register parameters and Java temporary/global registers
 262     // n.b. we save thread even though it gets installed in
    // xthread because we want to sanity check it later
 264     __ sd(c_rarg7, thread);
 265     __ sw(c_rarg6, parameter_size);
 266     __ sd(c_rarg5, parameters);
 267     __ sd(c_rarg4, entry_point);
 268     __ sd(c_rarg3, method);
 269     __ sd(c_rarg2, result_type);
 270     __ sd(c_rarg1, result);
 271     __ sd(c_rarg0, call_wrapper);
 272 
 273     __ sd(x9, x9_save);
 274 
 275     __ sd(x18, x18_save);
 276     __ sd(x19, x19_save);
 277     __ sd(x20, x20_save);
 278     __ sd(x21, x21_save);
 279     __ sd(x22, x22_save);
 280     __ sd(x23, x23_save);
 281     __ sd(x24, x24_save);
 282     __ sd(x25, x25_save);
 283     __ sd(x26, x26_save);
 284     __ sd(x27, x27_save);
 285 
 286     __ fsd(f8,  f8_save);
 287     __ fsd(f9,  f9_save);
 288     __ fsd(f18, f18_save);
 289     __ fsd(f19, f19_save);
 290     __ fsd(f20, f20_save);
 291     __ fsd(f21, f21_save);
 292     __ fsd(f22, f22_save);
 293     __ fsd(f23, f23_save);
 294     __ fsd(f24, f24_save);
 295     __ fsd(f25, f25_save);
 296     __ fsd(f26, f26_save);
 297     __ fsd(f27, f27_save);
 298 
 299     __ frrm(t0);
 300     __ sd(t0, frm_save);
 301     // Set frm to the state we need. We do want Round to Nearest. We
 302     // don't want non-IEEE rounding modes.
 303     Label skip_fsrmi;
 304     guarantee(__ RoundingMode::rne == 0, "must be");
 305     __ beqz(t0, skip_fsrmi);
 306     __ fsrmi(__ RoundingMode::rne);
 307     __ bind(skip_fsrmi);
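    // the caller's rounding mode saved in frm_save above is restored
    // just before returning (see skip_fsrm below)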
 308 
 309     // install Java thread in global register now we have saved
 310     // whatever value it held
 311     __ mv(xthread, c_rarg7);
 312 
 313     // And method
 314     __ mv(xmethod, c_rarg3);
 315 
 316     // set up the heapbase register
 317     __ reinit_heapbase();
 318 
 319 #ifdef ASSERT
 320     // make sure we have no pending exceptions
 321     {
 322       Label L;
 323       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 324       __ beqz(t0, L);
 325       __ stop("StubRoutines::call_stub: entered with pending exception");
 326       __ BIND(L);
 327     }
 328 #endif
 329     // pass parameters if any
 330     __ mv(esp, sp);
 331     __ slli(t0, c_rarg6, LogBytesPerWord);
 332     __ sub(t0, sp, t0); // Move SP out of the way
 333     __ andi(sp, t0, -2 * wordSize);
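    // round sp down to a 2 * wordSize (16-byte) boundary so the stack
    // keeps the alignment required by the ABI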
 334 
 335     BLOCK_COMMENT("pass parameters if any");
 336     Label parameters_done;
 337     // parameter count is still in c_rarg6
 338     // and parameter pointer identifying param 1 is in c_rarg5
 339     __ beqz(c_rarg6, parameters_done);
 340 
 341     address loop = __ pc();
 342     __ ld(t0, Address(c_rarg5, 0));
 343     __ addi(c_rarg5, c_rarg5, wordSize);
 344     __ addi(c_rarg6, c_rarg6, -1);
 345     __ push_reg(t0);
 346     __ bgtz(c_rarg6, loop);
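    // the loop above is, roughly: while (count-- > 0) { *--sp = *params++; }
    // i.e. the Java arguments are copied from the parameter area onto
    // the stack just below the new frame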
 347 
 348     __ BIND(parameters_done);
 349 
    // call Java entry -- passing Method* and current sp
 351     //      xmethod: Method*
 352     //      x19_sender_sp: sender sp
 353     BLOCK_COMMENT("call Java function");
 354     __ mv(x19_sender_sp, sp);
 355     __ jalr(c_rarg4);
 356 
 357     // save current address for use by exception handling code
 358 
 359     return_address = __ pc();
 360 
 361     // store result depending on type (everything that is not
 362     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 363     // n.b. this assumes Java returns an integral result in x10
 364     // and a floating result in j_farg0
 365     __ ld(j_rarg2, result);
 366     Label is_long, is_float, is_double, exit;
 367     __ ld(j_rarg1, result_type);
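    // an oop result is returned in x10 and stored as a full 64-bit
    // word, so T_OBJECT shares the T_LONG store path below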
 368     __ mv(t0, (u1)T_OBJECT);
 369     __ beq(j_rarg1, t0, is_long);
 370     __ mv(t0, (u1)T_LONG);
 371     __ beq(j_rarg1, t0, is_long);
 372     __ mv(t0, (u1)T_FLOAT);
 373     __ beq(j_rarg1, t0, is_float);
 374     __ mv(t0, (u1)T_DOUBLE);
 375     __ beq(j_rarg1, t0, is_double);
 376 
 377     // handle T_INT case
 378     __ sw(x10, Address(j_rarg2));
 379 
 380     __ BIND(exit);
 381 
 382     // pop parameters
 383     __ addi(esp, fp, sp_after_call_off * wordSize);
 384 
 385 #ifdef ASSERT
 386     // verify that threads correspond
 387     {
 388       Label L, S;
 389       __ ld(t0, thread);
 390       __ bne(xthread, t0, S);
 391       __ get_thread(t0);
 392       __ beq(xthread, t0, L);
 393       __ BIND(S);
 394       __ stop("StubRoutines::call_stub: threads must correspond");
 395       __ BIND(L);
 396     }
 397 #endif
 398 
 399     __ pop_cont_fastpath(xthread);
 400 
 401     // restore callee-save registers
 402     __ fld(f27, f27_save);
 403     __ fld(f26, f26_save);
 404     __ fld(f25, f25_save);
 405     __ fld(f24, f24_save);
 406     __ fld(f23, f23_save);
 407     __ fld(f22, f22_save);
 408     __ fld(f21, f21_save);
 409     __ fld(f20, f20_save);
 410     __ fld(f19, f19_save);
 411     __ fld(f18, f18_save);
 412     __ fld(f9,  f9_save);
 413     __ fld(f8,  f8_save);
 414 
 415     __ ld(x27, x27_save);
 416     __ ld(x26, x26_save);
 417     __ ld(x25, x25_save);
 418     __ ld(x24, x24_save);
 419     __ ld(x23, x23_save);
 420     __ ld(x22, x22_save);
 421     __ ld(x21, x21_save);
 422     __ ld(x20, x20_save);
 423     __ ld(x19, x19_save);
 424     __ ld(x18, x18_save);
 425 
 426     __ ld(x9, x9_save);
 427 
 428     // restore frm
 429     Label skip_fsrm;
 430     __ ld(t0, frm_save);
 431     __ frrm(t1);
 432     __ beq(t0, t1, skip_fsrm);
 433     __ fsrm(t0);
 434     __ bind(skip_fsrm);
 435 
 436     __ ld(c_rarg0, call_wrapper);
 437     __ ld(c_rarg1, result);
 438     __ ld(c_rarg2, result_type);
 439     __ ld(c_rarg3, method);
 440     __ ld(c_rarg4, entry_point);
 441     __ ld(c_rarg5, parameters);
 442     __ ld(c_rarg6, parameter_size);
 443     __ ld(c_rarg7, thread);
 444 
 445     // leave frame and return to caller
 446     __ leave();
 447     __ ret();
 448 
 449     // handle return types different from T_INT
 450 
 451     __ BIND(is_long);
 452     __ sd(x10, Address(j_rarg2, 0));
 453     __ j(exit);
 454 
 455     __ BIND(is_float);
 456     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 457     __ j(exit);
 458 
 459     __ BIND(is_double);
 460     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 461     __ j(exit);
 462 
 463     return start;
 464   }
 465 
 466   // Return point for a Java call if there's an exception thrown in
 467   // Java code.  The exception is caught and transformed into a
 468   // pending exception stored in JavaThread that can be tested from
 469   // within the VM.
 470   //
 471   // Note: Usually the parameters are removed by the callee. In case
 472   // of an exception crossing an activation frame boundary, that is
 473   // not the case if the callee is compiled code => need to setup the
 474   // sp.
 475   //
 476   // x10: exception oop
 477 
 478   address generate_catch_exception() {
 479     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 480     address start = __ pc();
 481 
 482     // same as in generate_call_stub():
 483     const Address thread(fp, thread_off * wordSize);
 484 
 485 #ifdef ASSERT
 486     // verify that threads correspond
 487     {
 488       Label L, S;
 489       __ ld(t0, thread);
 490       __ bne(xthread, t0, S);
 491       __ get_thread(t0);
 492       __ beq(xthread, t0, L);
 493       __ bind(S);
 494       __ stop("StubRoutines::catch_exception: threads must correspond");
 495       __ bind(L);
 496     }
 497 #endif
 498 
 499     // set pending exception
 500     __ verify_oop(x10);
 501 
 502     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
 503     __ mv(t0, (address)__FILE__);
 504     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 505     __ mv(t0, (int)__LINE__);
 506     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 507 
 508     // complete return to VM
 509     assert(StubRoutines::_call_stub_return_address != nullptr,
 510            "_call_stub_return_address must have been generated before");
 511     __ j(StubRoutines::_call_stub_return_address);
 512 
 513     return start;
 514   }
 515 
 516   // Continuation point for runtime calls returning with a pending
 517   // exception.  The pending exception check happened in the runtime
 518   // or native call stub.  The pending exception in Thread is
 519   // converted into a Java-level exception.
 520   //
 521   // Contract with Java-level exception handlers:
 522   // x10: exception
 523   // x13: throwing pc
 524   //
 525   // NOTE: At entry of this stub, exception-pc must be in RA !!
 526 
  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 529 
 530   address generate_forward_exception() {
 531     StubCodeMark mark(this, "StubRoutines", "forward exception");
 532     address start = __ pc();
 533 
 534     // Upon entry, RA points to the return address returning into
 535     // Java (interpreted or compiled) code; i.e., the return address
 536     // becomes the throwing pc.
 537     //
 538     // Arguments pushed before the runtime call are still on the stack
 539     // but the exception handler will reset the stack pointer ->
 540     // ignore them.  A potential result in registers can be ignored as
 541     // well.
 542 
 543 #ifdef ASSERT
 544     // make sure this code is only executed if there is a pending exception
 545     {
 546       Label L;
 547       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 548       __ bnez(t0, L);
 549       __ stop("StubRoutines::forward exception: no pending exception (1)");
 550       __ bind(L);
 551     }
 552 #endif
 553 
 554     // compute exception handler into x9
 555 
 556     // call the VM to find the handler address associated with the
 557     // caller address. pass thread in x10 and caller pc (ret address)
 558     // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
 559     // the stack.
 560     __ mv(c_rarg1, ra);
 561     // ra will be trashed by the VM call so we move it to x9
 562     // (callee-saved) because we also need to pass it to the handler
 563     // returned by this call.
 564     __ mv(x9, ra);
 565     BLOCK_COMMENT("call exception_handler_for_return_address");
 566     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 567                          SharedRuntime::exception_handler_for_return_address),
 568                     xthread, c_rarg1);
 569     // we should not really care that ra is no longer the callee
 570     // address. we saved the value the handler needs in x9 so we can
    // just copy it to x13. however, the C2 handler pushes its own
    // frame and then calls into the VM, and the VM code asserts that
 573     // the PC for the frame above the handler belongs to a compiled
 574     // Java method. So, we restore ra here to satisfy that assert.
 575     __ mv(ra, x9);
 576     // setup x10 & x13 & clear pending exception
 577     __ mv(x13, x9);
 578     __ mv(x9, x10);
 579     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 580     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 581 
 582 #ifdef ASSERT
 583     // make sure exception is set
 584     {
 585       Label L;
 586       __ bnez(x10, L);
 587       __ stop("StubRoutines::forward exception: no pending exception (2)");
 588       __ bind(L);
 589     }
 590 #endif
 591 
 592     // continue at exception handler
 593     // x10: exception
 594     // x13: throwing pc
 595     // x9: exception handler
 596     __ verify_oop(x10);
 597     __ jr(x9);
 598 
 599     return start;
 600   }
 601 
 602   // Non-destructive plausibility checks for oops
 603   //
 604   // Arguments:
 605   //    x10: oop to verify
 606   //    t0: error message
 607   //
 608   // Stack after saving c_rarg3:
 609   //    [tos + 0]: saved c_rarg3
 610   //    [tos + 1]: saved c_rarg2
 611   //    [tos + 2]: saved ra
 612   //    [tos + 3]: saved t1
 613   //    [tos + 4]: saved x10
 614   //    [tos + 5]: saved t0
 615   address generate_verify_oop() {
 616 
 617     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 618     address start = __ pc();
 619 
 620     Label exit, error;
 621 
 622     __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
 623 
 624     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 625     __ ld(c_rarg3, Address(c_rarg2));
 626     __ add(c_rarg3, c_rarg3, 1);
 627     __ sd(c_rarg3, Address(c_rarg2));
 628 
 629     // object is in x10
 630     // make sure object is 'reasonable'
 631     __ beqz(x10, exit); // if obj is null it is OK
 632 
 633     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 634     bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
 635 
 636     // return if everything seems ok
 637     __ bind(exit);
 638 
 639     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3
 640     __ ret();
 641 
 642     // handle errors
 643     __ bind(error);
 644     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
 645 
 646     __ push_reg(RegSet::range(x0, x31), sp);
 647     // debug(char* msg, int64_t pc, int64_t regs[])
 648     __ mv(c_rarg0, t0);             // pass address of error message
 649     __ mv(c_rarg1, ra);             // pass return address
 650     __ mv(c_rarg2, sp);             // pass address of regs on stack
 651 #ifndef PRODUCT
 652     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 653 #endif
 654     BLOCK_COMMENT("call MacroAssembler::debug");
 655     __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 656     __ ebreak();
 657 
 658     return start;
 659   }
 660 
 661   // The inner part of zero_words().
 662   //
 663   // Inputs:
 664   // x28: the HeapWord-aligned base address of an array to zero.
 665   // x29: the count in HeapWords, x29 > 0.
 666   //
 667   // Returns x28 and x29, adjusted for the caller to clear.
 668   // x28: the base address of the tail of words left to clear.
 669   // x29: the number of words in the tail.
 670   //      x29 < MacroAssembler::zero_words_block_size.
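  // The stub clears whole blocks of zero_words_block_size words,
  // optionally using cbo.zero (UseBlockZeroing) for large runs, and
  // leaves the sub-block tail for the caller to clear.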
 671 
 672   address generate_zero_blocks() {
 673     Label done;
 674 
 675     const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
 676 
 677     __ align(CodeEntryAlignment);
 678     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 679     address start = __ pc();
 680 
 681     if (UseBlockZeroing) {
 682       // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero
 683       // after alignment.
 684       Label small;
 685       int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
 686       __ mv(tmp1, low_limit);
 687       __ blt(cnt, tmp1, small);
 688       __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
 689       __ bind(small);
 690     }
 691 
 692     {
 693       // Clear the remaining blocks.
 694       Label loop;
 695       __ mv(tmp1, MacroAssembler::zero_words_block_size);
 696       __ blt(cnt, tmp1, done);
 697       __ bind(loop);
 698       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 699         __ sd(zr, Address(base, i * wordSize));
 700       }
 701       __ add(base, base, MacroAssembler::zero_words_block_size * wordSize);
 702       __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
 703       __ bge(cnt, tmp1, loop);
 704       __ bind(done);
 705     }
 706 
 707     __ ret();
 708 
 709     return start;
 710   }
 711 
 712   typedef enum {
 713     copy_forwards = 1,
 714     copy_backwards = -1
 715   } copy_direction;
 716 
 717   // Bulk copy of blocks of 8 words.
 718   //
 719   // count is a count of words.
 720   //
 721   // Precondition: count >= 8
 722   //
 723   // Postconditions:
 724   //
  // The low-order bits of count contain the remaining count
  // of words to copy.  The rest of count is trash.
 727   //
 728   // s and d are adjusted to point to the remaining words to copy
 729   //
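  // The main loop is software-pipelined: each iteration stores the 8
  // words loaded by the previous iteration while loading the next 8;
  // the drain block stores the final batch and then handles any
  // remaining 4- and 2-word tails.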
 730   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 731                            copy_direction direction) {
 732     int unit = wordSize * direction;
 733     int bias = wordSize;
 734 
 735     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 736       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 737 
 738     const Register stride = x30;
 739 
 740     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 741       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 742     assert_different_registers(s, d, count, t0);
 743 
 744     Label again, drain;
 745     const char* stub_name = nullptr;
 746     if (direction == copy_forwards) {
 747       stub_name = "forward_copy_longs";
 748     } else {
 749       stub_name = "backward_copy_longs";
 750     }
 751     StubCodeMark mark(this, "StubRoutines", stub_name);
 752     __ align(CodeEntryAlignment);
 753     __ bind(start);
 754 
 755     if (direction == copy_forwards) {
 756       __ sub(s, s, bias);
 757       __ sub(d, d, bias);
 758     }
 759 
 760 #ifdef ASSERT
 761     // Make sure we are never given < 8 words
 762     {
 763       Label L;
 764 
 765       __ mv(t0, 8);
 766       __ bge(count, t0, L);
      __ stop("generate_copy_longs called with < 8 words");
 768       __ bind(L);
 769     }
 770 #endif
 771 
 772     __ ld(tmp_reg0, Address(s, 1 * unit));
 773     __ ld(tmp_reg1, Address(s, 2 * unit));
 774     __ ld(tmp_reg2, Address(s, 3 * unit));
 775     __ ld(tmp_reg3, Address(s, 4 * unit));
 776     __ ld(tmp_reg4, Address(s, 5 * unit));
 777     __ ld(tmp_reg5, Address(s, 6 * unit));
 778     __ ld(tmp_reg6, Address(s, 7 * unit));
 779     __ ld(tmp_reg7, Address(s, 8 * unit));
 780     __ addi(s, s, 8 * unit);
 781 
 782     __ sub(count, count, 16);
 783     __ bltz(count, drain);
 784 
 785     __ bind(again);
 786 
 787     __ sd(tmp_reg0, Address(d, 1 * unit));
 788     __ sd(tmp_reg1, Address(d, 2 * unit));
 789     __ sd(tmp_reg2, Address(d, 3 * unit));
 790     __ sd(tmp_reg3, Address(d, 4 * unit));
 791     __ sd(tmp_reg4, Address(d, 5 * unit));
 792     __ sd(tmp_reg5, Address(d, 6 * unit));
 793     __ sd(tmp_reg6, Address(d, 7 * unit));
 794     __ sd(tmp_reg7, Address(d, 8 * unit));
 795 
 796     __ ld(tmp_reg0, Address(s, 1 * unit));
 797     __ ld(tmp_reg1, Address(s, 2 * unit));
 798     __ ld(tmp_reg2, Address(s, 3 * unit));
 799     __ ld(tmp_reg3, Address(s, 4 * unit));
 800     __ ld(tmp_reg4, Address(s, 5 * unit));
 801     __ ld(tmp_reg5, Address(s, 6 * unit));
 802     __ ld(tmp_reg6, Address(s, 7 * unit));
 803     __ ld(tmp_reg7, Address(s, 8 * unit));
 804 
 805     __ addi(s, s, 8 * unit);
 806     __ addi(d, d, 8 * unit);
 807 
 808     __ sub(count, count, 8);
 809     __ bgez(count, again);
 810 
 811     // Drain
 812     __ bind(drain);
 813 
 814     __ sd(tmp_reg0, Address(d, 1 * unit));
 815     __ sd(tmp_reg1, Address(d, 2 * unit));
 816     __ sd(tmp_reg2, Address(d, 3 * unit));
 817     __ sd(tmp_reg3, Address(d, 4 * unit));
 818     __ sd(tmp_reg4, Address(d, 5 * unit));
 819     __ sd(tmp_reg5, Address(d, 6 * unit));
 820     __ sd(tmp_reg6, Address(d, 7 * unit));
 821     __ sd(tmp_reg7, Address(d, 8 * unit));
 822     __ addi(d, d, 8 * unit);
 823 
 824     {
 825       Label L1, L2;
 826       __ test_bit(t0, count, 2);
 827       __ beqz(t0, L1);
 828 
 829       __ ld(tmp_reg0, Address(s, 1 * unit));
 830       __ ld(tmp_reg1, Address(s, 2 * unit));
 831       __ ld(tmp_reg2, Address(s, 3 * unit));
 832       __ ld(tmp_reg3, Address(s, 4 * unit));
 833       __ addi(s, s, 4 * unit);
 834 
 835       __ sd(tmp_reg0, Address(d, 1 * unit));
 836       __ sd(tmp_reg1, Address(d, 2 * unit));
 837       __ sd(tmp_reg2, Address(d, 3 * unit));
 838       __ sd(tmp_reg3, Address(d, 4 * unit));
 839       __ addi(d, d, 4 * unit);
 840 
 841       __ bind(L1);
 842 
 843       if (direction == copy_forwards) {
 844         __ addi(s, s, bias);
 845         __ addi(d, d, bias);
 846       }
 847 
 848       __ test_bit(t0, count, 1);
 849       __ beqz(t0, L2);
 850       if (direction == copy_backwards) {
 851         __ addi(s, s, 2 * unit);
 852         __ ld(tmp_reg0, Address(s));
 853         __ ld(tmp_reg1, Address(s, wordSize));
 854         __ addi(d, d, 2 * unit);
 855         __ sd(tmp_reg0, Address(d));
 856         __ sd(tmp_reg1, Address(d, wordSize));
 857       } else {
 858         __ ld(tmp_reg0, Address(s));
 859         __ ld(tmp_reg1, Address(s, wordSize));
 860         __ addi(s, s, 2 * unit);
 861         __ sd(tmp_reg0, Address(d));
 862         __ sd(tmp_reg1, Address(d, wordSize));
 863         __ addi(d, d, 2 * unit);
 864       }
 865       __ bind(L2);
 866     }
 867 
 868     __ ret();
 869   }
 870 
 871   Label copy_f, copy_b;
 872 
 873   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
 874 
 875   void copy_memory_v(Register s, Register d, Register count, int step) {
 876     bool is_backward = step < 0;
 877     int granularity = uabs(step);
 878 
 879     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 880     assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
 881     Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
 882     Label loop_forward, loop_backward, done;
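    // Strip-mining loop: vsetvli sets vl to the number of elements
    // processed this iteration (bounded by the hardware vector length
    // for sew with LMUL = 8); pointers advance by vl elements each
    // pass, with vl scaled to bytes below when sew != e8.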
 883 
 884     __ mv(dst, d);
 885     __ mv(src, s);
 886     __ mv(cnt, count);
 887 
 888     __ bind(loop_forward);
 889     __ vsetvli(vl, cnt, sew, Assembler::m8);
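    // when copying backwards (the regions may overlap), the forward
    // loop is only safe if the whole remaining count fits in a single
    // vector load/store pair (vl == cnt); otherwise branch to
    // loop_backward, which copies chunks starting from the tail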
 890     if (is_backward) {
 891       __ bne(vl, cnt, loop_backward);
 892     }
 893 
 894     __ vlex_v(v0, src, sew);
 895     __ sub(cnt, cnt, vl);
 896     if (sew != Assembler::e8) {
      // when sew == e8 (i.e., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 898       __ slli(vl, vl, sew);
 899     }
 900     __ add(src, src, vl);
 901 
 902     __ vsex_v(v0, dst, sew);
 903     __ add(dst, dst, vl);
 904     __ bnez(cnt, loop_forward);
 905 
 906     if (is_backward) {
 907       __ j(done);
 908 
 909       __ bind(loop_backward);
 910       __ sub(t0, cnt, vl);
 911       if (sew != Assembler::e8) {
        // when sew == e8 (i.e., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 913         __ slli(t0, t0, sew);
 914       }
 915       __ add(tmp1, s, t0);
 916       __ vlex_v(v0, tmp1, sew);
 917       __ add(tmp2, d, t0);
 918       __ vsex_v(v0, tmp2, sew);
 919       __ sub(cnt, cnt, vl);
 920       __ bnez(cnt, loop_forward);
 921       __ bind(done);
 922     }
 923   }
 924 
 925   // All-singing all-dancing memory copy.
 926   //
 927   // Copy count units of memory from s to d.  The size of a unit is
 928   // step, which can be positive or negative depending on the direction
 929   // of copy.
 930   //
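  // Strategy: use the vector routine when RVV is enabled (and the GC
  // barrier supports it for oop arrays); otherwise copy
  // element-by-element for small or misaligned inputs, then an 8-byte
  // loop once src and dst share 8-byte alignment, with a 32-byte
  // unrolled loop for the bulk of the data.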
 931   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 932                    Register s, Register d, Register count, int step) {
 933     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 934     if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
 935       return copy_memory_v(s, d, count, step);
 936     }
 937 
 938     bool is_backwards = step < 0;
 939     int granularity = uabs(step);
 940 
 941     const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
 942     const Register gct1 = x28, gct2 = x29, gct3 = t2;
 943 
 944     Label same_aligned;
 945     Label copy_big, copy32_loop, copy8_loop, copy_small, done;
 946 
    // The size of the copy32_loop body increases significantly with ZGC barriers.
 948     // Need conditional far branches to reach a point beyond the loop in this case.
 949     bool is_far = UseZGC && ZGenerational;
 950 
 951     __ beqz(count, done, is_far);
 952     __ slli(cnt, count, exact_log2(granularity));
 953     if (is_backwards) {
 954       __ add(src, s, cnt);
 955       __ add(dst, d, cnt);
 956     } else {
 957       __ mv(src, s);
 958       __ mv(dst, d);
 959     }
 960 
 961     if (is_aligned) {
 962       __ addi(t0, cnt, -32);
 963       __ bgez(t0, copy32_loop);
 964       __ addi(t0, cnt, -8);
 965       __ bgez(t0, copy8_loop, is_far);
 966       __ j(copy_small);
 967     } else {
 968       __ mv(t0, 16);
 969       __ blt(cnt, t0, copy_small, is_far);
 970 
 971       __ xorr(t0, src, dst);
 972       __ andi(t0, t0, 0b111);
 973       __ bnez(t0, copy_small, is_far);
 974 
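      // src and dst now have identical low three bits; copy one
      // element at a time until src (and hence dst) is 8-byte aligned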
 975       __ bind(same_aligned);
 976       __ andi(t0, src, 0b111);
 977       __ beqz(t0, copy_big);
 978       if (is_backwards) {
 979         __ addi(src, src, step);
 980         __ addi(dst, dst, step);
 981       }
 982       bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
 983       bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
 984       if (!is_backwards) {
 985         __ addi(src, src, step);
 986         __ addi(dst, dst, step);
 987       }
 988       __ addi(cnt, cnt, -granularity);
 989       __ beqz(cnt, done, is_far);
 990       __ j(same_aligned);
 991 
 992       __ bind(copy_big);
 993       __ mv(t0, 32);
 994       __ blt(cnt, t0, copy8_loop, is_far);
 995     }
 996 
 997     __ bind(copy32_loop);
 998     if (is_backwards) {
 999       __ addi(src, src, -wordSize * 4);
1000       __ addi(dst, dst, -wordSize * 4);
1001     }
    // we first load 32 bytes, then store them, so the direction here doesn't matter
1003     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
1004     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
1005     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
1006     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
1007 
1008     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
1009     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
1010     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
1011     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
1012 
1013     if (!is_backwards) {
1014       __ addi(src, src, wordSize * 4);
1015       __ addi(dst, dst, wordSize * 4);
1016     }
1017     __ addi(t0, cnt, -(32 + wordSize * 4));
1018     __ addi(cnt, cnt, -wordSize * 4);
1019     __ bgez(t0, copy32_loop); // cnt >= 32, do next loop
1020 
1021     __ beqz(cnt, done); // if that's all - done
1022 
    __ addi(t0, cnt, -8); // if not - copy the remainder
1024     __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
1025 
1026     __ bind(copy8_loop);
1027     if (is_backwards) {
1028       __ addi(src, src, -wordSize);
1029       __ addi(dst, dst, -wordSize);
1030     }
1031     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1032     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1033 
1034     if (!is_backwards) {
1035       __ addi(src, src, wordSize);
1036       __ addi(dst, dst, wordSize);
1037     }
1038     __ addi(t0, cnt, -(8 + wordSize));
1039     __ addi(cnt, cnt, -wordSize);
1040     __ bgez(t0, copy8_loop); // cnt >= 8, do next loop
1041 
1042     __ beqz(cnt, done); // if that's all - done
1043 
1044     __ bind(copy_small);
1045     if (is_backwards) {
1046       __ addi(src, src, step);
1047       __ addi(dst, dst, step);
1048     }
1049 
1050     bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
1051     bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
1052 
1053     if (!is_backwards) {
1054       __ addi(src, src, step);
1055       __ addi(dst, dst, step);
1056     }
1057     __ addi(cnt, cnt, -granularity);
1058     __ bgtz(cnt, copy_small);
1059 
1060     __ bind(done);
1061   }
1062 
1063   // Scan over array at a for count oops, verifying each one.
1064   // Preserves a and count, clobbers t0 and t1.
1065   void verify_oop_array(size_t size, Register a, Register count, Register temp) {
1066     Label loop, end;
1067     __ mv(t1, zr);
1068     __ slli(t0, count, exact_log2(size));
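    // t0 = total length in bytes, t1 = running byte offset into the array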
1069     __ bind(loop);
1070     __ bgeu(t1, t0, end);
1071 
1072     __ add(temp, a, t1);
1073     if (size == (size_t)wordSize) {
1074       __ ld(temp, Address(temp, 0));
1075       __ verify_oop(temp);
1076     } else {
1077       __ lwu(temp, Address(temp, 0));
1078       __ decode_heap_oop(temp); // calls verify_oop
1079     }
1080     __ add(t1, t1, size);
1081     __ j(loop);
1082     __ bind(end);
1083   }
1084 
1085   // Arguments:
1086   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1087   //             ignored
1088   //   is_oop  - true => oop array, so generate store check code
1089   //   name    - stub name string
1090   //
1091   // Inputs:
1092   //   c_rarg0   - source array address
1093   //   c_rarg1   - destination array address
1094   //   c_rarg2   - element count, treated as ssize_t, can be zero
1095   //
1096   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1097   // the hardware handle it.  The two dwords within qwords that span
1098   // cache line boundaries will still be loaded and stored atomically.
1099   //
1100   // Side Effects:
1101   //   disjoint_int_copy_entry is set to the no-overlap entry point
1102   //   used by generate_conjoint_int_oop_copy().
1103   //
1104   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry,
1105                                  const char* name, bool dest_uninitialized = false) {
1106     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1107     RegSet saved_reg = RegSet::of(s, d, count);
1108     __ align(CodeEntryAlignment);
1109     StubCodeMark mark(this, "StubRoutines", name);
1110     address start = __ pc();
1111     __ enter();
1112 
1113     if (entry != nullptr) {
1114       *entry = __ pc();
1115       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1116       BLOCK_COMMENT("Entry:");
1117     }
1118 
1119     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1120     if (dest_uninitialized) {
1121       decorators |= IS_DEST_UNINITIALIZED;
1122     }
1123     if (aligned) {
1124       decorators |= ARRAYCOPY_ALIGNED;
1125     }
1126 
1127     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1128     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1129 
1130     if (is_oop) {
1131       // save regs before copy_memory
1132       __ push_reg(RegSet::of(d, count), sp);
1133     }
1134 
1135     {
1136       // UnsafeMemoryAccess page error: continue after unsafe access
1137       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1138       UnsafeMemoryAccessMark umam(this, add_entry, true);
1139       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1140     }
1141 
1142     if (is_oop) {
1143       __ pop_reg(RegSet::of(d, count), sp);
1144       if (VerifyOops) {
1145         verify_oop_array(size, d, count, t2);
1146       }
1147     }
1148 
1149     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1150 
1151     __ leave();
1152     __ mv(x10, zr); // return 0
1153     __ ret();
1154     return start;
1155   }
1156 
1157   // Arguments:
1158   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1159   //             ignored
1160   //   is_oop  - true => oop array, so generate store check code
1161   //   name    - stub name string
1162   //
1163   // Inputs:
1164   //   c_rarg0   - source array address
1165   //   c_rarg1   - destination array address
1166   //   c_rarg2   - element count, treated as ssize_t, can be zero
1167   //
1168   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1169   // the hardware handle it.  The two dwords within qwords that span
1170   // cache line boundaries will still be loaded and stored atomically.
1171   //
1172   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1173                                  address* entry, const char* name,
1174                                  bool dest_uninitialized = false) {
1175     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1176     RegSet saved_regs = RegSet::of(s, d, count);
1177     StubCodeMark mark(this, "StubRoutines", name);
1178     address start = __ pc();
1179     __ enter();
1180 
1181     if (entry != nullptr) {
1182       *entry = __ pc();
1183       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1184       BLOCK_COMMENT("Entry:");
1185     }
1186 
1187     // use fwd copy when (d-s) above_equal (count*size)
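    // (unsigned compare: if d < s the subtraction wraps to a large
    // value, so that case also takes the non-overlapping forward copy)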
1188     __ sub(t0, d, s);
1189     __ slli(t1, count, exact_log2(size));
1190     Label L_continue;
1191     __ bltu(t0, t1, L_continue);
1192     __ j(nooverlap_target);
1193     __ bind(L_continue);
1194 
1195     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1196     if (dest_uninitialized) {
1197       decorators |= IS_DEST_UNINITIALIZED;
1198     }
1199     if (aligned) {
1200       decorators |= ARRAYCOPY_ALIGNED;
1201     }
1202 
1203     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1204     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1205 
1206     if (is_oop) {
1207       // save regs before copy_memory
1208       __ push_reg(RegSet::of(d, count), sp);
1209     }
1210 
1211     {
1212       // UnsafeMemoryAccess page error: continue after unsafe access
1213       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1214       UnsafeMemoryAccessMark umam(this, add_entry, true);
1215       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1216     }
1217 
1218     if (is_oop) {
1219       __ pop_reg(RegSet::of(d, count), sp);
1220       if (VerifyOops) {
1221         verify_oop_array(size, d, count, t2);
1222       }
1223     }
1224     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1225     __ leave();
1226     __ mv(x10, zr); // return 0
1227     __ ret();
1228     return start;
1229   }
1230 
1231   // Arguments:
1232   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1233   //             ignored
1234   //   name    - stub name string
1235   //
1236   // Inputs:
1237   //   c_rarg0   - source array address
1238   //   c_rarg1   - destination array address
1239   //   c_rarg2   - element count, treated as ssize_t, can be zero
1240   //
1241   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1242   // we let the hardware handle it.  The one to eight bytes within words,
1243   // dwords or qwords that span cache line boundaries will still be loaded
1244   // and stored atomically.
1245   //
  //
1253   // Side Effects:
1254   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1255   //   used by generate_conjoint_byte_copy().
1256   //
1257   address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) {
1258     const bool not_oop = false;
1259     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1260   }
1261 
1262   // Arguments:
1263   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1264   //             ignored
1265   //   name    - stub name string
1266   //
1267   // Inputs:
1268   //   c_rarg0   - source array address
1269   //   c_rarg1   - destination array address
1270   //   c_rarg2   - element count, treated as ssize_t, can be zero
1271   //
1272   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1273   // we let the hardware handle it.  The one to eight bytes within words,
1274   // dwords or qwords that span cache line boundaries will still be loaded
1275   // and stored atomically.
1276   //
1277   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1278                                       address* entry, const char* name) {
1279     const bool not_oop = false;
1280     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1281   }
1282 
1283   // Arguments:
1284   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1285   //             ignored
1286   //   name    - stub name string
1287   //
1288   // Inputs:
1289   //   c_rarg0   - source array address
1290   //   c_rarg1   - destination array address
1291   //   c_rarg2   - element count, treated as ssize_t, can be zero
1292   //
1293   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1294   // let the hardware handle it.  The two or four words within dwords
1295   // or qwords that span cache line boundaries will still be loaded
1296   // and stored atomically.
1297   //
1298   // Side Effects:
1299   //   disjoint_short_copy_entry is set to the no-overlap entry point
1300   //   used by generate_conjoint_short_copy().
1301   //
1302   address generate_disjoint_short_copy(bool aligned,
1303                                        address* entry, const char* name) {
1304     const bool not_oop = false;
1305     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1306   }
1307 
1308   // Arguments:
1309   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1310   //             ignored
1311   //   name    - stub name string
1312   //
1313   // Inputs:
1314   //   c_rarg0   - source array address
1315   //   c_rarg1   - destination array address
1316   //   c_rarg2   - element count, treated as ssize_t, can be zero
1317   //
1318   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1319   // let the hardware handle it.  The two or four words within dwords
1320   // or qwords that span cache line boundaries will still be loaded
1321   // and stored atomically.
1322   //
1323   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1324                                        address* entry, const char* name) {
1325     const bool not_oop = false;
1326     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1327   }
1328 
1329   // Arguments:
1330   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1331   //             ignored
1332   //   name    - stub name string
1333   //
1334   // Inputs:
1335   //   c_rarg0   - source array address
1336   //   c_rarg1   - destination array address
1337   //   c_rarg2   - element count, treated as ssize_t, can be zero
1338   //
1339   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1340   // the hardware handle it.  The two dwords within qwords that span
1341   // cache line boundaries will still be loaded and stored atomically.
1342   //
1343   // Side Effects:
1344   //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_copy().
1346   //
1347   address generate_disjoint_int_copy(bool aligned, address* entry,
1348                                      const char* name, bool dest_uninitialized = false) {
1349     const bool not_oop = false;
1350     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1351   }
1352 
1353   // Arguments:
1354   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1355   //             ignored
1356   //   name    - stub name string
1357   //
1358   // Inputs:
1359   //   c_rarg0   - source array address
1360   //   c_rarg1   - destination array address
1361   //   c_rarg2   - element count, treated as ssize_t, can be zero
1362   //
1363   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1364   // the hardware handle it.  The two dwords within qwords that span
1365   // cache line boundaries will still be loaded and stored atomically.
1366   //
1367   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1368                                      address* entry, const char* name,
1369                                      bool dest_uninitialized = false) {
1370     const bool not_oop = false;
1371     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1372   }
1373 
1374 
1375   // Arguments:
1376   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1377   //             ignored
1378   //   name    - stub name string
1379   //
1380   // Inputs:
1381   //   c_rarg0   - source array address
1382   //   c_rarg1   - destination array address
1383   //   c_rarg2   - element count, treated as size_t, can be zero
1384   //
1385   // Side Effects:
1386   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1387   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1388   //
1389   address generate_disjoint_long_copy(bool aligned, address* entry,
1390                                       const char* name, bool dest_uninitialized = false) {
1391     const bool not_oop = false;
1392     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1393   }
1394 
1395   // Arguments:
1396   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1397   //             ignored
1398   //   name    - stub name string
1399   //
1400   // Inputs:
1401   //   c_rarg0   - source array address
1402   //   c_rarg1   - destination array address
1403   //   c_rarg2   - element count, treated as size_t, can be zero
1404   //
1405   address generate_conjoint_long_copy(bool aligned,
1406                                       address nooverlap_target, address* entry,
1407                                       const char* name, bool dest_uninitialized = false) {
1408     const bool not_oop = false;
1409     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1410   }
1411 
1412   // Arguments:
1413   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1414   //             ignored
1415   //   name    - stub name string
1416   //
1417   // Inputs:
1418   //   c_rarg0   - source array address
1419   //   c_rarg1   - destination array address
1420   //   c_rarg2   - element count, treated as size_t, can be zero
1421   //
1422   // Side Effects:
1423   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1424   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1425   //
1426   address generate_disjoint_oop_copy(bool aligned, address* entry,
1427                                      const char* name, bool dest_uninitialized) {
1428     const bool is_oop = true;
1429     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1430     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1431   }
1432 
1433   // Arguments:
1434   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1435   //             ignored
1436   //   name    - stub name string
1437   //
1438   // Inputs:
1439   //   c_rarg0   - source array address
1440   //   c_rarg1   - destination array address
1441   //   c_rarg2   - element count, treated as size_t, can be zero
1442   //
1443   address generate_conjoint_oop_copy(bool aligned,
1444                                      address nooverlap_target, address* entry,
1445                                      const char* name, bool dest_uninitialized) {
1446     const bool is_oop = true;
1447     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1448     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1449                                   name, dest_uninitialized);
1450   }
1451 
1452   // Helper for generating a dynamic type check.
1453   // Smashes t0, t1.
1454   void generate_type_check(Register sub_klass,
1455                            Register super_check_offset,
1456                            Register super_klass,
1457                            Label& L_success) {
1458     assert_different_registers(sub_klass, super_check_offset, super_klass);
1459 
1460     BLOCK_COMMENT("type_check:");
1461 
1462     Label L_miss;
1463 
1464     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1465     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1466 
1467     // Fall through on failure!
1468     __ BIND(L_miss);
1469   }
1470 
1471   //
1472   //  Generate checkcasting array copy stub
1473   //
1474   //  Input:
1475   //    c_rarg0   - source array address
1476   //    c_rarg1   - destination array address
1477   //    c_rarg2   - element count, treated as ssize_t, can be zero
1478   //    c_rarg3   - size_t ckoff (super_check_offset)
1479   //    c_rarg4   - oop ckval (super_klass)
1480   //
1481   //  Output:
1482   //    x10 ==  0  -  success
1483   //    x10 == -1^K - failure, where K is partial transfer count
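  //    (the caller recovers K as ~x10: e.g. if 3 elements were copied before a
  //     failing type check, x10 == ~3 == -4)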
1484   //
1485   address generate_checkcast_copy(const char* name, address* entry,
1486                                   bool dest_uninitialized = false) {
1487     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1488 
1489     // Input registers (after setup_arg_regs)
1490     const Register from        = c_rarg0;   // source array address
1491     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1493     const Register ckoff       = c_rarg3;   // super_check_offset
1494     const Register ckval       = c_rarg4;   // super_klass
1495 
1496     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1497     RegSet wb_post_saved_regs  = RegSet::of(count);
1498 
    // Registers used as temps (x7, x9, x18, x19 are saved on entry and restored on exit)
    const Register count_save  = x19;       // orig elements count
1501     const Register start_to    = x18;       // destination array start address
1502     const Register copied_oop  = x7;        // actual oop copied
1503     const Register r9_klass    = x9;        // oop._klass
1504 
1505     // Registers used as gc temps (x15, x16, x17 are save-on-call)
1506     const Register gct1 = x15, gct2 = x16, gct3 = x17;
1507 
1508     //---------------------------------------------------------------
1509     // Assembler stub will be used for this call to arraycopy
1510     // if the two arrays are subtypes of Object[] but the
1511     // destination array type is not equal to or a supertype
1512     // of the source type.  Each element must be separately
1513     // checked.
1514 
1515     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1516                                copied_oop, r9_klass, count_save);
1517 
1518     __ align(CodeEntryAlignment);
1519     StubCodeMark mark(this, "StubRoutines", name);
1520     address start = __ pc();
1521 
1522     __ enter(); // required for proper stackwalking of RuntimeStub frame
1523 
1524     // Caller of this entry point must set up the argument registers.
1525     if (entry != nullptr) {
1526       *entry = __ pc();
1527       BLOCK_COMMENT("Entry:");
1528     }
1529 
1530     // Empty array:  Nothing to do
1531     __ beqz(count, L_done);
1532 
1533     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1534 
1535 #ifdef ASSERT
1536     BLOCK_COMMENT("assert consistent ckoff/ckval");
1537     // The ckoff and ckval must be mutually consistent,
1538     // even though caller generates both.
1539     { Label L;
1540       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1541       __ lwu(start_to, Address(ckval, sco_offset));
1542       __ beq(ckoff, start_to, L);
1543       __ stop("super_check_offset inconsistent");
1544       __ bind(L);
1545     }
1546 #endif //ASSERT
1547 
1548     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1549     if (dest_uninitialized) {
1550       decorators |= IS_DEST_UNINITIALIZED;
1551     }
1552 
1553     bool is_oop = true;
1554     int element_size = UseCompressedOops ? 4 : 8;
1555 
1556     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1557     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1558 
1559     // save the original count
1560     __ mv(count_save, count);
1561 
1562     // Copy from low to high addresses
1563     __ mv(start_to, to);              // Save destination array start address
1564     __ j(L_load_element);
1565 
1566     // ======== begin loop ========
1567     // (Loop is rotated; its entry is L_load_element.)
1568     // Loop control:
1569     //   for count to 0 do
1570     //     copied_oop = load_heap_oop(from++)
1571     //     ... generate_type_check ...
1572     //     store_heap_oop(to++, copied_oop)
1573     //   end
1574 
1575     __ align(OptoLoopAlignment);
1576 
1577     __ BIND(L_store_element);
1578     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1579                       Address(to, 0), copied_oop,
1580                       gct1, gct2, gct3);
1581     __ add(to, to, UseCompressedOops ? 4 : 8);
1582     __ sub(count, count, 1);
1583     __ beqz(count, L_do_card_marks);
1584 
1585     // ======== loop entry is here ========
1586     __ BIND(L_load_element);
1587     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1588                      copied_oop, Address(from, 0),
1589                      gct1);
1590     __ add(from, from, UseCompressedOops ? 4 : 8);
1591     __ beqz(copied_oop, L_store_element);
1592 
1593     __ load_klass(r9_klass, copied_oop);// query the object klass
1594     generate_type_check(r9_klass, ckoff, ckval, L_store_element);
1595     // ======== end loop ========
1596 
1597     // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
1599     // Emit GC store barriers for the oops we have copied and report
1600     // their number to the caller.
1601 
1602     __ sub(count, count_save, count);     // K = partially copied oop count
1603     __ xori(count, count, -1);                   // report (-1^K) to caller
1604     __ beqz(count, L_done_pop);
1605 
1606     __ BIND(L_do_card_marks);
1607     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);
1608 
1609     __ bind(L_done_pop);
1610     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1611     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1612 
1613     __ bind(L_done);
1614     __ mv(x10, count);
1615     __ leave();
1616     __ ret();
1617 
1618     return start;
1619   }
1620 
1621   // Perform range checks on the proposed arraycopy.
1622   // Kills temp, but nothing else.
1623   // Also, clean the sign bits of src_pos and dst_pos.
1624   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1625                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1627                               Register dst_pos, // destination position (c_rarg3)
1628                               Register length,
1629                               Register temp,
1630                               Label& L_failed) {
1631     BLOCK_COMMENT("arraycopy_range_checks:");
1632 
1633     assert_different_registers(t0, temp);
1634 
1635     // if [src_pos + length > arrayOop(src)->length()] then FAIL
1636     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1637     __ addw(temp, length, src_pos);
1638     __ bgtu(temp, t0, L_failed);
1639 
1640     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1641     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1642     __ addw(temp, length, dst_pos);
1643     __ bgtu(temp, t0, L_failed);
1644 
1645     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1646     __ zero_extend(src_pos, src_pos, 32);
1647     __ zero_extend(dst_pos, dst_pos, 32);
1648 
1649     BLOCK_COMMENT("arraycopy_range_checks done");
1650   }
1651 
1652   //
1653   //  Generate 'unsafe' array copy stub
1654   //  Though just as safe as the other stubs, it takes an unscaled
1655   //  size_t argument instead of an element count.
1656   //
1657   //  Input:
1658   //    c_rarg0   - source array address
1659   //    c_rarg1   - destination array address
1660   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1661   //
1662   // Examines the alignment of the operands and dispatches
1663   // to a long, int, short, or byte copy loop.
1664   //
1665   address generate_unsafe_copy(const char* name,
1666                                address byte_copy_entry,
1667                                address short_copy_entry,
1668                                address int_copy_entry,
1669                                address long_copy_entry) {
1670     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1671                 int_copy_entry != nullptr && long_copy_entry != nullptr);
1672     Label L_long_aligned, L_int_aligned, L_short_aligned;
1673     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1674 
1675     __ align(CodeEntryAlignment);
1676     StubCodeMark mark(this, "StubRoutines", name);
1677     address start = __ pc();
1678     __ enter(); // required for proper stackwalking of RuntimeStub frame
1679 
1680     // bump this on entry, not on exit:
1681     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1682 
1683     __ orr(t0, s, d);
1684     __ orr(t0, t0, count);
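    // t0 = s | d | count; its low bits give the coarsest power-of-two unit at which
    // the source, destination and byte count are all aligned, so the tests below
    // dispatch to the widest applicable element copy.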
1685 
1686     __ andi(t0, t0, BytesPerLong - 1);
1687     __ beqz(t0, L_long_aligned);
1688     __ andi(t0, t0, BytesPerInt - 1);
1689     __ beqz(t0, L_int_aligned);
1690     __ test_bit(t0, t0, 0);
1691     __ beqz(t0, L_short_aligned);
1692     __ j(RuntimeAddress(byte_copy_entry));
1693 
1694     __ BIND(L_short_aligned);
1695     __ srli(count, count, LogBytesPerShort);  // size => short_count
1696     __ j(RuntimeAddress(short_copy_entry));
1697     __ BIND(L_int_aligned);
1698     __ srli(count, count, LogBytesPerInt);    // size => int_count
1699     __ j(RuntimeAddress(int_copy_entry));
1700     __ BIND(L_long_aligned);
1701     __ srli(count, count, LogBytesPerLong);   // size => long_count
1702     __ j(RuntimeAddress(long_copy_entry));
1703 
1704     return start;
1705   }
1706 
1707   //
1708   //  Generate generic array copy stubs
1709   //
1710   //  Input:
1711   //    c_rarg0    -  src oop
1712   //    c_rarg1    -  src_pos (32-bits)
1713   //    c_rarg2    -  dst oop
1714   //    c_rarg3    -  dst_pos (32-bits)
1715   //    c_rarg4    -  element count (32-bits)
1716   //
1717   //  Output:
1718   //    x10 ==  0  -  success
1719   //    x10 == -1^K - failure, where K is partial transfer count
1720   //
1721   address generate_generic_copy(const char* name,
1722                                 address byte_copy_entry, address short_copy_entry,
1723                                 address int_copy_entry, address oop_copy_entry,
1724                                 address long_copy_entry, address checkcast_copy_entry) {
1725     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1726                 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1727                 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1728     Label L_failed, L_failed_0, L_objArray;
1729     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1730 
1731     // Input registers
1732     const Register src        = c_rarg0;  // source array oop
1733     const Register src_pos    = c_rarg1;  // source position
1734     const Register dst        = c_rarg2;  // destination array oop
1735     const Register dst_pos    = c_rarg3;  // destination position
1736     const Register length     = c_rarg4;
1737 
1738     // Registers used as temps
1739     const Register dst_klass = c_rarg5;
1740 
1741     __ align(CodeEntryAlignment);
1742 
1743     StubCodeMark mark(this, "StubRoutines", name);
1744 
1745     address start = __ pc();
1746 
1747     __ enter(); // required for proper stackwalking of RuntimeStub frame
1748 
1749     // bump this on entry, not on exit:
1750     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1751 
1752     //-----------------------------------------------------------------------
1753     // Assembler stub will be used for this call to arraycopy
1754     // if the following conditions are met:
1755     //
1756     // (1) src and dst must not be null.
1757     // (2) src_pos must not be negative.
1758     // (3) dst_pos must not be negative.
1759     // (4) length  must not be negative.
1760     // (5) src klass and dst klass should be the same and not null.
1761     // (6) src and dst should be arrays.
1762     // (7) src_pos + length must not exceed length of src.
1763     // (8) dst_pos + length must not exceed length of dst.
1764     //
1765 
1766     // if src is null then return -1
1767     __ beqz(src, L_failed);
1768 
1769     // if [src_pos < 0] then return -1
1770     __ sign_extend(t0, src_pos, 32);
1771     __ bltz(t0, L_failed);
1772 
1773     // if dst is null then return -1
1774     __ beqz(dst, L_failed);
1775 
1776     // if [dst_pos < 0] then return -1
1777     __ sign_extend(t0, dst_pos, 32);
1778     __ bltz(t0, L_failed);
1779 
1780     // registers used as temp
1781     const Register scratch_length    = x28; // elements count to copy
1782     const Register scratch_src_klass = x29; // array klass
1783     const Register lh                = x30; // layout helper
1784 
1785     // if [length < 0] then return -1
1786     __ sign_extend(scratch_length, length, 32);    // length (elements count, 32-bits value)
1787     __ bltz(scratch_length, L_failed);
1788 
1789     __ load_klass(scratch_src_klass, src);
1790 #ifdef ASSERT
1791     {
1792       BLOCK_COMMENT("assert klasses not null {");
1793       Label L1, L2;
1794       __ bnez(scratch_src_klass, L2);   // it is broken if klass is null
1795       __ bind(L1);
1796       __ stop("broken null klass");
1797       __ bind(L2);
1798       __ load_klass(t0, dst, t1);
1799       __ beqz(t0, L1);     // this would be broken also
1800       BLOCK_COMMENT("} assert klasses not null done");
1801     }
1802 #endif
1803 
1804     // Load layout helper (32-bits)
1805     //
1806     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1807     // 32        30    24            16              8     2                 0
1808     //
1809     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
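    //   i.e. lh == (array_tag << 30) | (header_size_in_bytes << 16)
    //              | (element_type << 8) | log2_element_size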
1810     //
1811 
1812     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1813 
1814     // Handle objArrays completely differently...
1815     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1816     __ lw(lh, Address(scratch_src_klass, lh_offset));
1817     __ mv(t0, objArray_lh);
1818     __ beq(lh, t0, L_objArray);
1819 
1820     // if [src->klass() != dst->klass()] then return -1
1821     __ load_klass(t1, dst);
1822     __ bne(t1, scratch_src_klass, L_failed);
1823 
    // if !src->is_Array() then return -1
    // i.e. if (lh >= 0)
1826     __ bgez(lh, L_failed);
1827 
1828     // At this point, it is known to be a typeArray (array_tag 0x3).
1829 #ifdef ASSERT
1830     {
1831       BLOCK_COMMENT("assert primitive array {");
1832       Label L;
1833       __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1834       __ bge(lh, t1, L);
1835       __ stop("must be a primitive array");
1836       __ bind(L);
1837       BLOCK_COMMENT("} assert primitive array done");
1838     }
1839 #endif
1840 
1841     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1842                            t1, L_failed);
1843 
1844     // TypeArrayKlass
1845     //
1846     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1847     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1848     //
1849 
1850     const Register t0_offset = t0;    // array offset
1851     const Register x30_elsize = lh;   // element size
1852 
1853     // Get array_header_in_bytes()
1854     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1855     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
    __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // left shift to clear the bits above the header_size field
1857     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
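    // t0_offset now holds this array type's header size in bytes, i.e. the offset
    // of element 0 from the start of the array object.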
1858 
1859     __ add(src, src, t0_offset);           // src array offset
1860     __ add(dst, dst, t0_offset);           // dst array offset
1861     BLOCK_COMMENT("choose copy loop based on element size");
1862 
1863     // next registers should be set before the jump to corresponding stub
1864     const Register from     = c_rarg0;  // source array address
1865     const Register to       = c_rarg1;  // destination array address
1866     const Register count    = c_rarg2;  // elements count
1867 
    // 'from', 'to' and 'count' must be set in this order, since they
    // alias 'src', 'src_pos' and 'dst' respectively (c_rarg0..c_rarg2).
1870 
1871     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1872 
1873     // The possible values of elsize are 0-3, i.e. exact_log2(element
1874     // size in bytes).  We do a simple bitwise binary search.
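    //   elsize: 0 -> byte, 1 -> short, 2 -> int, 3 -> long;
    //   bit 1 selects {int,long} vs {byte,short}, and bit 0 then picks within the pair.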
1875   __ BIND(L_copy_bytes);
1876     __ test_bit(t0, x30_elsize, 1);
1877     __ bnez(t0, L_copy_ints);
1878     __ test_bit(t0, x30_elsize, 0);
1879     __ bnez(t0, L_copy_shorts);
1880     __ add(from, src, src_pos); // src_addr
1881     __ add(to, dst, dst_pos); // dst_addr
1882     __ sign_extend(count, scratch_length, 32); // length
1883     __ j(RuntimeAddress(byte_copy_entry));
1884 
1885   __ BIND(L_copy_shorts);
1886     __ shadd(from, src_pos, src, t0, 1); // src_addr
1887     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1888     __ sign_extend(count, scratch_length, 32); // length
1889     __ j(RuntimeAddress(short_copy_entry));
1890 
1891   __ BIND(L_copy_ints);
1892     __ test_bit(t0, x30_elsize, 0);
1893     __ bnez(t0, L_copy_longs);
1894     __ shadd(from, src_pos, src, t0, 2); // src_addr
1895     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
1896     __ sign_extend(count, scratch_length, 32); // length
1897     __ j(RuntimeAddress(int_copy_entry));
1898 
1899   __ BIND(L_copy_longs);
1900 #ifdef ASSERT
1901     {
1902       BLOCK_COMMENT("assert long copy {");
1903       Label L;
1904       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
1905       __ sign_extend(lh, lh, 32);
1906       __ mv(t0, LogBytesPerLong);
1907       __ beq(x30_elsize, t0, L);
1908       __ stop("must be long copy, but elsize is wrong");
1909       __ bind(L);
1910       BLOCK_COMMENT("} assert long copy done");
1911     }
1912 #endif
1913     __ shadd(from, src_pos, src, t0, 3); // src_addr
1914     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
1915     __ sign_extend(count, scratch_length, 32); // length
1916     __ j(RuntimeAddress(long_copy_entry));
1917 
1918     // ObjArrayKlass
1919   __ BIND(L_objArray);
1920     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1921 
1922     Label L_plain_copy, L_checkcast_copy;
1923     // test array classes for subtyping
1924     __ load_klass(t2, dst);
1925     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
1926 
1927     // Identically typed arrays can be copied without element-wise checks.
1928     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1929                            t1, L_failed);
1930 
1931     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1932     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1933     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1934     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1935     __ sign_extend(count, scratch_length, 32); // length
1936   __ BIND(L_plain_copy);
1937     __ j(RuntimeAddress(oop_copy_entry));
1938 
1939   __ BIND(L_checkcast_copy);
1940     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
1941     {
1942       // Before looking at dst.length, make sure dst is also an objArray.
1943       __ lwu(t0, Address(t2, lh_offset));
1944       __ mv(t1, objArray_lh);
1945       __ bne(t0, t1, L_failed);
1946 
1947       // It is safe to examine both src.length and dst.length.
1948       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1949                              t2, L_failed);
1950 
1951       __ load_klass(dst_klass, dst); // reload
1952 
1953       // Marshal the base address arguments now, freeing registers.
1954       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1955       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1956       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1957       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1958       __ sign_extend(count, length, 32);      // length (reloaded)
1959       const Register sco_temp = c_rarg3;      // this register is free now
1960       assert_different_registers(from, to, count, sco_temp,
1961                                  dst_klass, scratch_src_klass);
1962 
1963       // Generate the type check.
1964       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1965       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1966 
1967       // Smashes t0, t1
1968       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
1969 
1970       // Fetch destination element klass from the ObjArrayKlass header.
1971       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1972       __ ld(dst_klass, Address(dst_klass, ek_offset));
1973       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1974 
1975       // the checkcast_copy loop needs two extra arguments:
1976       assert(c_rarg3 == sco_temp, "#3 already in place");
1977       // Set up arguments for checkcast_copy_entry.
1978       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
1979       __ j(RuntimeAddress(checkcast_copy_entry));
1980     }
1981 
1982   __ BIND(L_failed);
1983     __ mv(x10, -1);
1984     __ leave();   // required for proper stackwalking of RuntimeStub frame
1985     __ ret();
1986 
1987     return start;
1988   }
1989 
1990   //
1991   // Generate stub for array fill. If "aligned" is true, the
1992   // "to" address is assumed to be heapword aligned.
1993   //
1994   // Arguments for generated stub:
1995   //   to:    c_rarg0
1996   //   value: c_rarg1
1997   //   count: c_rarg2 treated as signed
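  //
  // The value is replicated across a 64-bit word and the bulk of the fill is done
  // 8 bytes at a time (fill_words); short arrays and any unaligned head are filled
  // element by element, and a byte/short tail is finished with one overlapping store.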
1998   //
1999   address generate_fill(BasicType t, bool aligned, const char* name) {
2000     __ align(CodeEntryAlignment);
2001     StubCodeMark mark(this, "StubRoutines", name);
2002     address start = __ pc();
2003 
2004     BLOCK_COMMENT("Entry:");
2005 
    const Register to        = c_rarg0;  // destination array address
2007     const Register value     = c_rarg1;  // value
2008     const Register count     = c_rarg2;  // elements count
2009 
2010     const Register bz_base   = x28;      // base for block_zero routine
2011     const Register cnt_words = x29;      // temp register
2012     const Register tmp_reg   = t1;
2013 
2014     __ enter();
2015 
2016     Label L_fill_elements, L_exit1;
2017 
2018     int shift = -1;
2019     switch (t) {
2020       case T_BYTE:
2021         shift = 0;
2022 
2023         // Zero extend value
2024         // 8 bit -> 16 bit
2025         __ andi(value, value, 0xff);
2026         __ mv(tmp_reg, value);
2027         __ slli(tmp_reg, tmp_reg, 8);
2028         __ orr(value, value, tmp_reg);
2029 
2030         // 16 bit -> 32 bit
2031         __ mv(tmp_reg, value);
2032         __ slli(tmp_reg, tmp_reg, 16);
2033         __ orr(value, value, tmp_reg);
2034 
2035         __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2036         __ bltu(count, tmp_reg, L_fill_elements);
2037         break;
2038       case T_SHORT:
2039         shift = 1;
2040         // Zero extend value
2041         // 16 bit -> 32 bit
2042         __ andi(value, value, 0xffff);
2043         __ mv(tmp_reg, value);
2044         __ slli(tmp_reg, tmp_reg, 16);
2045         __ orr(value, value, tmp_reg);
2046 
2047         // Short arrays (< 8 bytes) fill by element
2048         __ mv(tmp_reg, 8 >> shift);
2049         __ bltu(count, tmp_reg, L_fill_elements);
2050         break;
2051       case T_INT:
2052         shift = 2;
2053 
2054         // Short arrays (< 8 bytes) fill by element
2055         __ mv(tmp_reg, 8 >> shift);
2056         __ bltu(count, tmp_reg, L_fill_elements);
2057         break;
2058       default: ShouldNotReachHere();
2059     }
2060 
    // Align the destination address to an 8-byte boundary.
2062     Label L_skip_align1, L_skip_align2, L_skip_align4;
2063     if (!aligned) {
2064       switch (t) {
2065         case T_BYTE:
2066           // One byte misalignment happens only for byte arrays.
2067           __ test_bit(t0, to, 0);
2068           __ beqz(t0, L_skip_align1);
2069           __ sb(value, Address(to, 0));
2070           __ addi(to, to, 1);
2071           __ addiw(count, count, -1);
2072           __ bind(L_skip_align1);
2073           // Fallthrough
2074         case T_SHORT:
2075           // Two bytes misalignment happens only for byte and short (char) arrays.
2076           __ test_bit(t0, to, 1);
2077           __ beqz(t0, L_skip_align2);
2078           __ sh(value, Address(to, 0));
2079           __ addi(to, to, 2);
2080           __ addiw(count, count, -(2 >> shift));
2081           __ bind(L_skip_align2);
2082           // Fallthrough
2083         case T_INT:
2084           // Align to 8 bytes, we know we are 4 byte aligned to start.
2085           __ test_bit(t0, to, 2);
2086           __ beqz(t0, L_skip_align4);
2087           __ sw(value, Address(to, 0));
2088           __ addi(to, to, 4);
2089           __ addiw(count, count, -(4 >> shift));
2090           __ bind(L_skip_align4);
2091           break;
2092         default: ShouldNotReachHere();
2093       }
2094     }
2095 
2096     //
2097     //  Fill large chunks
2098     //
2099     __ srliw(cnt_words, count, 3 - shift); // number of words
2100 
2101     // 32 bit -> 64 bit
2102     __ andi(value, value, 0xffffffff);
2103     __ mv(tmp_reg, value);
2104     __ slli(tmp_reg, tmp_reg, 32);
2105     __ orr(value, value, tmp_reg);
2106 
2107     __ slli(tmp_reg, cnt_words, 3 - shift);
2108     __ subw(count, count, tmp_reg);
2109     {
2110       __ fill_words(to, cnt_words, value);
2111     }
2112 
2113     // Remaining count is less than 8 bytes. Fill it by a single store.
2114     // Note that the total length is no less than 8 bytes.
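    // E.g. a byte fill of 13 bytes: fill_words stores the first 8, count == 5 remains,
    // and the store below writes the last 8 bytes of the range, re-writing 3
    // already-filled bytes with the same value.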
2115     if (t == T_BYTE || t == T_SHORT) {
2116       __ beqz(count, L_exit1);
2117       __ shadd(to, count, to, tmp_reg, shift); // points to the end
2118       __ sd(value, Address(to, -8)); // overwrite some elements
2119       __ bind(L_exit1);
2120       __ leave();
2121       __ ret();
2122     }
2123 
    // Handle fills of less than 8 bytes, element by element.
2125     Label L_fill_2, L_fill_4, L_exit2;
2126     __ bind(L_fill_elements);
2127     switch (t) {
2128       case T_BYTE:
2129         __ test_bit(t0, count, 0);
2130         __ beqz(t0, L_fill_2);
2131         __ sb(value, Address(to, 0));
2132         __ addi(to, to, 1);
2133         __ bind(L_fill_2);
2134         __ test_bit(t0, count, 1);
2135         __ beqz(t0, L_fill_4);
2136         __ sh(value, Address(to, 0));
2137         __ addi(to, to, 2);
2138         __ bind(L_fill_4);
2139         __ test_bit(t0, count, 2);
2140         __ beqz(t0, L_exit2);
2141         __ sw(value, Address(to, 0));
2142         break;
2143       case T_SHORT:
2144         __ test_bit(t0, count, 0);
2145         __ beqz(t0, L_fill_4);
2146         __ sh(value, Address(to, 0));
2147         __ addi(to, to, 2);
2148         __ bind(L_fill_4);
2149         __ test_bit(t0, count, 1);
2150         __ beqz(t0, L_exit2);
2151         __ sw(value, Address(to, 0));
2152         break;
2153       case T_INT:
2154         __ beqz(count, L_exit2);
2155         __ sw(value, Address(to, 0));
2156         break;
2157       default: ShouldNotReachHere();
2158     }
2159     __ bind(L_exit2);
2160     __ leave();
2161     __ ret();
2162     return start;
2163   }
2164 
2165   void generate_arraycopy_stubs() {
2166     address entry                     = nullptr;
2167     address entry_jbyte_arraycopy     = nullptr;
2168     address entry_jshort_arraycopy    = nullptr;
2169     address entry_jint_arraycopy      = nullptr;
2170     address entry_oop_arraycopy       = nullptr;
2171     address entry_jlong_arraycopy     = nullptr;
2172     address entry_checkcast_arraycopy = nullptr;
2173 
2174     generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards);
2175     generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards);
2176 
2177     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2178 
2179     //*** jbyte
2180     // Always need aligned and unaligned versions
2181     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_byte_copy(false, &entry,
2182                                                                                    "jbyte_disjoint_arraycopy");
2183     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_byte_copy(false, entry,
2184                                                                                    &entry_jbyte_arraycopy,
2185                                                                                    "jbyte_arraycopy");
2186     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, &entry,
2187                                                                                    "arrayof_jbyte_disjoint_arraycopy");
2188     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_byte_copy(true, entry, nullptr,
2189                                                                                    "arrayof_jbyte_arraycopy");
2190 
2191     //*** jshort
2192     // Always need aligned and unaligned versions
2193     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2194                                                                                     "jshort_disjoint_arraycopy");
2195     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2196                                                                                     &entry_jshort_arraycopy,
2197                                                                                     "jshort_arraycopy");
2198     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2199                                                                                     "arrayof_jshort_disjoint_arraycopy");
2200     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2201                                                                                     "arrayof_jshort_arraycopy");
2202 
2203     //*** jint
2204     // Aligned versions
2205     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_copy(true, &entry,
2206                                                                                   "arrayof_jint_disjoint_arraycopy");
2207     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2208                                                                                   "arrayof_jint_arraycopy");
2209     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2210     // entry_jint_arraycopy always points to the unaligned version
2211     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_int_copy(false, &entry,
2212                                                                                   "jint_disjoint_arraycopy");
2213     StubRoutines::_jint_arraycopy                    = generate_conjoint_int_copy(false, entry,
2214                                                                                   &entry_jint_arraycopy,
2215                                                                                   "jint_arraycopy");
2216 
2217     //*** jlong
2218     // It is always aligned
2219     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, &entry,
2220                                                                                    "arrayof_jlong_disjoint_arraycopy");
2221     StubRoutines::_arrayof_jlong_arraycopy           = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2222                                                                                    "arrayof_jlong_arraycopy");
2223     StubRoutines::_jlong_disjoint_arraycopy          = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2224     StubRoutines::_jlong_arraycopy                   = StubRoutines::_arrayof_jlong_arraycopy;
2225 
2226     //*** oops
2227     {
2228       // With compressed oops we need unaligned versions; notice that
2229       // we overwrite entry_oop_arraycopy.
2230       bool aligned = !UseCompressedOops;
2231 
2232       StubRoutines::_arrayof_oop_disjoint_arraycopy
2233         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2234                                      /*dest_uninitialized*/false);
2235       StubRoutines::_arrayof_oop_arraycopy
2236         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2237                                      /*dest_uninitialized*/false);
      // Same as above, but without pre-barriers (destination is uninitialized)
2239       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2240         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2241                                      /*dest_uninitialized*/true);
2242       StubRoutines::_arrayof_oop_arraycopy_uninit
2243         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2244                                      /*dest_uninitialized*/true);
2245     }
2246 
2247     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2248     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2249     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2250     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2251 
2252     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2253     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2254                                                                         /*dest_uninitialized*/true);
2255 
2256 
2257     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2258                                                               entry_jbyte_arraycopy,
2259                                                               entry_jshort_arraycopy,
2260                                                               entry_jint_arraycopy,
2261                                                               entry_jlong_arraycopy);
2262 
2263     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2264                                                                entry_jbyte_arraycopy,
2265                                                                entry_jshort_arraycopy,
2266                                                                entry_jint_arraycopy,
2267                                                                entry_oop_arraycopy,
2268                                                                entry_jlong_arraycopy,
2269                                                                entry_checkcast_arraycopy);
2270 
2271     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2272     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2273     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2274     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2275     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2276     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2277   }
2278 
  // code for comparing 16 bytes of strings with the same encoding
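  // Note: assumes the previous 8 bytes of str1 and str2 are already in tmp1 and tmp2;
  // loads run ahead of the compares (software pipelining) and control jumps to
  // DIFF1/DIFF2 as soon as a mismatching longword is found.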
2280   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
2281     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31;
2282     __ ld(tmp5, Address(str1));
2283     __ addi(str1, str1, 8);
2284     __ xorr(tmp4, tmp1, tmp2);
2285     __ ld(cnt1, Address(str2));
2286     __ addi(str2, str2, 8);
2287     __ bnez(tmp4, DIFF1);
2288     __ ld(tmp1, Address(str1));
2289     __ addi(str1, str1, 8);
2290     __ xorr(tmp4, tmp5, cnt1);
2291     __ ld(tmp2, Address(str2));
2292     __ addi(str2, str2, 8);
2293     __ bnez(tmp4, DIFF2);
2294   }
2295 
  // code for comparing 8 characters of a Latin1 string with a UTF-16 string
2297   void compare_string_8_x_LU(Register tmpL, Register tmpU, Register strL, Register strU, Label& DIFF) {
2298     const Register tmp = x30, tmpLval = x12;
2299     __ ld(tmpLval, Address(strL));
2300     __ addi(strL, strL, wordSize);
2301     __ ld(tmpU, Address(strU));
2302     __ addi(strU, strU, wordSize);
2303     __ inflate_lo32(tmpL, tmpLval);
2304     __ xorr(tmp, tmpU, tmpL);
2305     __ bnez(tmp, DIFF);
2306 
2307     __ ld(tmpU, Address(strU));
2308     __ addi(strU, strU, wordSize);
2309     __ inflate_hi32(tmpL, tmpLval);
2310     __ xorr(tmp, tmpU, tmpL);
2311     __ bnez(tmp, DIFF);
2312   }
2313 
2314   // x10  = result
2315   // x11  = str1
2316   // x12  = cnt1
2317   // x13  = str2
2318   // x14  = cnt2
2319   // x28  = tmp1
2320   // x29  = tmp2
2321   // x30  = tmp3
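  //
  // isLU == true:  str1 is Latin1 and str2 is UTF-16; isLU == false: the reverse.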
2322   address generate_compare_long_string_different_encoding(bool isLU) {
2323     __ align(CodeEntryAlignment);
2324     StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL");
2325     address entry = __ pc();
2326     Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
2327     const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
2328                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
2329 
    // cnt2 == number of characters left to compare
    // Compare the first 4 characters, which are already loaded into tmp1/tmp2
2332     __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2);
2333     __ mv(isLU ? tmp1 : tmp2, tmp3);
2334     __ addi(str1, str1, isLU ? wordSize / 2 : wordSize);
2335     __ addi(str2, str2, isLU ? wordSize : wordSize / 2);
2336     __ sub(cnt2, cnt2, wordSize / 2); // Already loaded 4 symbols
2337 
2338     __ xorr(tmp3, tmp1, tmp2);
2339     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2340 
2341     Register strU = isLU ? str2 : str1,
2342              strL = isLU ? str1 : str2,
2343              tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
2344              tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
2345 
    // To make the main loop 8-byte aligned on strL, first load another 4 bytes from strL.
2347     // cnt2 is >= 68 here, no need to check it for >= 0
2348     __ lwu(tmpL, Address(strL));
2349     __ addi(strL, strL, wordSize / 2);
2350     __ ld(tmpU, Address(strU));
2351     __ addi(strU, strU, wordSize);
2352     __ inflate_lo32(tmp3, tmpL);
2353     __ mv(tmpL, tmp3);
2354     __ xorr(tmp3, tmpU, tmpL);
2355     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2356     __ addi(cnt2, cnt2, -wordSize / 2);
2357 
    // strL is now 8-byte aligned
2359     __ sub(cnt2, cnt2, wordSize * 2);
2360     __ bltz(cnt2, TAIL);
2361     __ bind(SMALL_LOOP); // smaller loop
2362       __ sub(cnt2, cnt2, wordSize * 2);
2363       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2364       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2365       __ bgez(cnt2, SMALL_LOOP);
2366       __ addi(t0, cnt2, wordSize * 2);
2367       __ beqz(t0, DONE);
2368     __ bind(TAIL);  // 1..15 characters left
      // The tail loads below may overlap characters that have already been compared.
2370 
2371       __ addi(t0, cnt2, wordSize);
2372       __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
2373       __ bltz(t0, LOAD_LAST);
      // at least 8 characters remain, so we can do one compare_string_8_x_LU
2375       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2376       __ addi(cnt2, cnt2, -wordSize);
2377       __ beqz(cnt2, DONE);  // no character left
2378       __ bind(LOAD_LAST);   // cnt2 = 1..7 characters left
2379 
2380       __ addi(cnt2, cnt2, -wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
2381       __ slli(t0, cnt2, 1);     // t0 is now an offset in strU which points to last 16 bytes
2382       __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
2383       __ add(strU, strU, t0);   // Address of last 16 bytes in UTF-16 string
2384       __ load_int_misaligned(tmpL, Address(strL), t0, false);
2385       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2386       __ inflate_lo32(tmp3, tmpL);
2387       __ mv(tmpL, tmp3);
2388       __ xorr(tmp3, tmpU, tmpL);
2389       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2390 
2391       __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
2392       __ addi(strU, strU, wordSize);   // Address of last 8 bytes in UTF-16 string
2393       __ load_int_misaligned(tmpL, Address(strL), t0, false);
2394       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2395       __ inflate_lo32(tmp3, tmpL);
2396       __ mv(tmpL, tmp3);
2397       __ xorr(tmp3, tmpU, tmpL);
2398       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2399       __ j(DONE); // no character left
2400 
2401       // Find the first different characters in the longwords and
2402       // compute their difference.
2403     __ bind(CALCULATE_DIFFERENCE);
2404       __ ctzc_bit(tmp4, tmp3);
2405       __ srl(tmp1, tmp1, tmp4);
2406       __ srl(tmp2, tmp2, tmp4);
2407       __ andi(tmp1, tmp1, 0xFFFF);
2408       __ andi(tmp2, tmp2, 0xFFFF);
2409       __ sub(result, tmp1, tmp2);
2410     __ bind(DONE);
2411       __ ret();
2412     return entry;
2413   }
2414 
2415   address generate_method_entry_barrier() {
2416     __ align(CodeEntryAlignment);
2417     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
2418 
2419     Label deoptimize_label;
2420 
2421     address start = __ pc();
2422 
2423     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2424 
2425     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
2426       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
2427       Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
2428       __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
2429       __ lwu(t1, t1);
2430       __ sw(t1, thread_epoch_addr);
2431       // There are two ways this can work:
      // - The writer did a system-wide icache shootdown after the instruction stream update.
      //   Hence we do nothing.
      // - The writer trusts us to make sure our icache is in sync before entering.
      //   Hence we use a cmodx fence (fence.i, may change).
2436       if (UseCtxFencei) {
2437         __ cmodx_fence();
2438       }
2439       __ membar(__ LoadLoad);
2440     }
2441 
2442     __ set_last_Java_frame(sp, fp, ra);
2443 
2444     __ enter();
2445     __ add(t1, sp, wordSize);
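    // t1 now points at the return address slot of the frame just built by enter();
    // it is passed to the runtime barrier below as c_rarg0.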
2446 
2447     __ sub(sp, sp, 4 * wordSize);
2448 
2449     __ push_call_clobbered_registers();
2450 
2451     __ mv(c_rarg0, t1);
2452     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
2453 
2454     __ reset_last_Java_frame(true);
2455 
2456     __ mv(t0, x10);
2457 
2458     __ pop_call_clobbered_registers();
2459 
2460     __ bnez(t0, deoptimize_label);
2461 
2462     __ leave();
2463     __ ret();
2464 
2465     __ BIND(deoptimize_label);
2466 
2467     __ ld(t0, Address(sp, 0));
2468     __ ld(fp, Address(sp, wordSize));
2469     __ ld(ra, Address(sp, wordSize * 2));
2470     __ ld(t1, Address(sp, wordSize * 3));
2471 
2472     __ mv(sp, t0);
2473     __ jr(t1);
2474 
2475     return start;
2476   }
2477 
2478   // x10  = result
2479   // x11  = str1
2480   // x12  = cnt1
2481   // x13  = str2
2482   // x14  = cnt2
2483   // x28  = tmp1
2484   // x29  = tmp2
2485   // x30  = tmp3
2486   // x31  = tmp4
2487   address generate_compare_long_string_same_encoding(bool isLL) {
2488     __ align(CodeEntryAlignment);
2489     StubCodeMark mark(this, "StubRoutines", isLL ?
2490                       "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU");
2491     address entry = __ pc();
2492     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
2493           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
2494     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2495                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2496     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
2497 
    // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
    // Account for the 8 bytes the caller has already loaded into tmp1/tmp2.
    __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
    // Advance the pointers past that previous read.
2502     __ add(str1, str1, wordSize);
2503     __ add(str2, str2, wordSize);
2504     // less than 16 bytes left?
2505     __ sub(cnt2, cnt2, isLL ? 16 : 8);
2506     __ push_reg(spilled_regs, sp);
2507     __ bltz(cnt2, TAIL);
2508     __ bind(SMALL_LOOP);
2509       compare_string_16_bytes_same(DIFF, DIFF2);
2510       __ sub(cnt2, cnt2, isLL ? 16 : 8);
2511       __ bgez(cnt2, SMALL_LOOP);
2512     __ bind(TAIL);
2513       __ addi(cnt2, cnt2, isLL ? 16 : 8);
2514       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
2515       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2516       __ blez(cnt2, CHECK_LAST);
2517       __ xorr(tmp4, tmp1, tmp2);
2518       __ bnez(tmp4, DIFF);
2519       __ ld(tmp1, Address(str1));
2520       __ addi(str1, str1, 8);
2521       __ ld(tmp2, Address(str2));
2522       __ addi(str2, str2, 8);
2523       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2524     __ bind(CHECK_LAST);
2525       if (!isLL) {
2526         __ add(cnt2, cnt2, cnt2); // now in bytes
2527       }
2528       __ xorr(tmp4, tmp1, tmp2);
2529       __ bnez(tmp4, DIFF);
2530       __ add(str1, str1, cnt2);
2531       __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
2532       __ add(str2, str2, cnt2);
2533       __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
2534       __ xorr(tmp4, tmp5, cnt1);
2535       __ beqz(tmp4, LENGTH_DIFF);
2536       // Find the first different characters in the longwords and
2537       // compute their difference.
2538     __ bind(DIFF2);
2539       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2540       __ srl(tmp5, tmp5, tmp3);
2541       __ srl(cnt1, cnt1, tmp3);
2542       if (isLL) {
2543         __ andi(tmp5, tmp5, 0xFF);
2544         __ andi(cnt1, cnt1, 0xFF);
2545       } else {
2546         __ andi(tmp5, tmp5, 0xFFFF);
2547         __ andi(cnt1, cnt1, 0xFFFF);
2548       }
2549       __ sub(result, tmp5, cnt1);
2550       __ j(LENGTH_DIFF);
2551     __ bind(DIFF);
2552       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2553       __ srl(tmp1, tmp1, tmp3);
2554       __ srl(tmp2, tmp2, tmp3);
2555       if (isLL) {
2556         __ andi(tmp1, tmp1, 0xFF);
2557         __ andi(tmp2, tmp2, 0xFF);
2558       } else {
2559         __ andi(tmp1, tmp1, 0xFFFF);
2560         __ andi(tmp2, tmp2, 0xFFFF);
2561       }
2562       __ sub(result, tmp1, tmp2);
2563       __ j(LENGTH_DIFF);
2564     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
2565       __ xorr(tmp4, tmp1, tmp2);
2566       __ bnez(tmp4, DIFF);
2567     __ bind(LENGTH_DIFF);
2568       __ pop_reg(spilled_regs, sp);
2569       __ ret();
2570     return entry;
2571   }
2572 
2573   void generate_compare_long_strings() {
2574     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true);
2575     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false);
2576     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true);
2577     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false);
2578   }
2579 
2580   // x10 result
2581   // x11 src
2582   // x12 src count
2583   // x13 pattern
2584   // x14 pattern count
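  //
  // Linear scan: the first pattern character is broadcast across a 64-bit word and
  // matched against 8 (Latin1) or 4 (UTF-16) source characters at a time using
  // SWAR compares; each candidate position is then verified character by character
  // against the rest of the pattern.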
2585   address generate_string_indexof_linear(bool needle_isL, bool haystack_isL)
2586   {
2587     const char* stubName = needle_isL
2588            ? (haystack_isL ? "indexof_linear_ll" : "indexof_linear_ul")
2589            : "indexof_linear_uu";
2590     __ align(CodeEntryAlignment);
2591     StubCodeMark mark(this, "StubRoutines", stubName);
2592     address entry = __ pc();
2593 
2594     int needle_chr_size = needle_isL ? 1 : 2;
2595     int haystack_chr_size = haystack_isL ? 1 : 2;
2596     int needle_chr_shift = needle_isL ? 0 : 1;
2597     int haystack_chr_shift = haystack_isL ? 0 : 1;
2598     bool isL = needle_isL && haystack_isL;
2599     // parameters
2600     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
2601     // temporary registers
2602     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
2603     // redefinitions
2604     Register ch1 = x28, ch2 = x29;
2605     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
2606 
2607     __ push_reg(spilled_regs, sp);
2608 
2609     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
2610           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
2611           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
2612           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
2613           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
2614           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
2615 
2616     __ ld(ch1, Address(needle));
2617     __ ld(ch2, Address(haystack));
2618     // src.length - pattern.length
2619     __ sub(haystack_len, haystack_len, needle_len);
2620 
2621     // first is needle[0]
2622     __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first);
2623     uint64_t mask0101 = UCONST64(0x0101010101010101);
2624     uint64_t mask0001 = UCONST64(0x0001000100010001);
2625     __ mv(mask1, haystack_isL ? mask0101 : mask0001);
2626     __ mul(first, first, mask1);
2627     uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
2628     uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
2629     __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
2630     if (needle_isL != haystack_isL) {
2631       __ mv(tmp, ch1);
2632     }
2633     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
2634     __ blez(haystack_len, L_SMALL);
2635 
2636     if (needle_isL != haystack_isL) {
2637       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2638     }
2639     // xorr, sub, orr, notr, andr
2640     // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
2641     // eg:
2642     // first:        aa aa aa aa aa aa aa aa
2643     // ch2:          aa aa li nx jd ka aa aa
2644     // match_mask:   80 80 00 00 00 00 80 80
2645     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2646 
2647     // search first char of needle, if success, goto L_HAS_ZERO;
2648     __ bnez(match_mask, L_HAS_ZERO);
2649     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2650     __ add(result, result, wordSize / haystack_chr_size);
2651     __ add(haystack, haystack, wordSize);
2652     __ bltz(haystack_len, L_POST_LOOP);
2653 
2654     __ bind(L_LOOP);
2655     __ ld(ch2, Address(haystack));
2656     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2657     __ bnez(match_mask, L_HAS_ZERO);
2658 
2659     __ bind(L_LOOP_PROCEED);
2660     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2661     __ add(haystack, haystack, wordSize);
2662     __ add(result, result, wordSize / haystack_chr_size);
2663     __ bgez(haystack_len, L_LOOP);
2664 
2665     __ bind(L_POST_LOOP);
2666     __ mv(ch2, -wordSize / haystack_chr_size);
2667     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
2668     __ ld(ch2, Address(haystack));
2669     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2670     __ neg(haystack_len, haystack_len);
2671     __ xorr(ch2, first, ch2);
2672     __ sub(match_mask, ch2, mask1);
2673     __ orr(ch2, ch2, mask2);
2674     __ mv(trailing_zeros, -1); // all bits set
2675     __ j(L_SMALL_PROCEED);
2676 
2677     __ align(OptoLoopAlignment);
2678     __ bind(L_SMALL);
2679     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2680     __ neg(haystack_len, haystack_len);
2681     if (needle_isL != haystack_isL) {
2682       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2683     }
2684     __ xorr(ch2, first, ch2);
2685     __ sub(match_mask, ch2, mask1);
2686     __ orr(ch2, ch2, mask2);
2687     __ mv(trailing_zeros, -1); // all bits set
2688 
2689     __ bind(L_SMALL_PROCEED);
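    // SWAR zero-lane detection on (first ^ ch2): the lines below leave 0x80 (Latin1)
    // or 0x8000 (UTF-16) in every lane of match_mask where the haystack character
    // equals the first pattern character; trailing_zeros masks off lanes that lie
    // beyond the end of the haystack.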
2690     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
2691     __ notr(ch2, ch2);
2692     __ andr(match_mask, match_mask, ch2);
2693     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
2694     __ beqz(match_mask, NOMATCH);
2695 
2696     __ bind(L_SMALL_HAS_ZERO_LOOP);
2697     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros
2698     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2699     __ mv(ch2, wordSize / haystack_chr_size);
2700     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
2701     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2702     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2703     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2704 
2705     __ bind(L_SMALL_CMP_LOOP);
2706     __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
2707     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2708     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
2709     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2710     __ add(trailing_zeros, trailing_zeros, 1);
2711     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
2712     __ beq(first, ch2, L_SMALL_CMP_LOOP);
2713 
2714     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
2715     __ beqz(match_mask, NOMATCH);
2716     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2717     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2718     __ add(result, result, 1);
2719     __ add(haystack, haystack, haystack_chr_size);
2720     __ j(L_SMALL_HAS_ZERO_LOOP);
2721 
2722     __ align(OptoLoopAlignment);
2723     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
2724     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2725     __ j(DONE);
2726 
2727     __ align(OptoLoopAlignment);
2728     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
2729     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2730     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2731     __ j(DONE);
2732 
2733     __ align(OptoLoopAlignment);
2734     __ bind(L_HAS_ZERO);
2735     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2736     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2737     __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
    __ orr(haystack_len, haystack_len, needle_len); // pack needle_len (32 bits) into the upper half of haystack_len
2739     __ sub(result, result, 1); // array index from 0, so result -= 1
2740 
2741     __ bind(L_HAS_ZERO_LOOP);
2742     __ mv(needle_len, wordSize / haystack_chr_size);
2743     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
2744     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
2745     // load next 8 bytes from haystack, and increase result index
2746     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2747     __ add(result, result, 1);
2748     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2749     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2750 
2751     // compare one char
2752     __ bind(L_CMP_LOOP);
2753     __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
2754     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
2755     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2756     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2757     __ add(trailing_zeros, trailing_zeros, 1); // next char index
2758     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
2759     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
2760     __ beq(needle_len, ch2, L_CMP_LOOP);
2761 
2762     __ bind(L_CMP_LOOP_NOMATCH);
2763     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
2764     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index
2765     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2766     __ add(haystack, haystack, haystack_chr_size);
2767     __ j(L_HAS_ZERO_LOOP);
2768 
2769     __ align(OptoLoopAlignment);
2770     __ bind(L_CMP_LOOP_LAST_CMP);
2771     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
2772     __ j(DONE);
2773 
2774     __ align(OptoLoopAlignment);
2775     __ bind(L_CMP_LOOP_LAST_CMP2);
2776     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2777     __ add(result, result, 1);
2778     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2779     __ j(DONE);
2780 
2781     __ align(OptoLoopAlignment);
2782     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
2783     // 1) Restore the "result" index. Until the L_HAS_ZERO block the index was a
2784     // multiple of wordSize/haystack_chr_size. The byte octet was analyzed in
2785     // L_HAS_ZERO_LOOP, so result grew by at most wordSize/haystack_chr_size - 1
2786     // and the respective high bits are unchanged. L_LOOP_PROCEED will increase
2787     // result by the number of characters analyzed, so we can simply reset the
2788     // lower bits of result here: the 2 lower bits for UU/UL, the 3 lower bits for LL.
2789     // 2) Restore needle_len and haystack_len from the "compressed" haystack_len.
2790     // 3) Advance haystack to the next haystack octet. result & 7/3 is the index
2791     // of the last analyzed substring inside the current octet, so haystack is at
2792     // its start address and needs to be advanced to the next octet.
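         // Roughly, in scalar form (illustrative sketch only):
         //   match_mask   = result & (wordSize/haystack_chr_size - 1); // chars analyzed in this octet
         //   needle_len   = haystack_len >> 32;                        // (2)
         //   result      &= haystack_isL ? ~7 : ~3;                    // (1)
         //   haystack    -= match_mask << haystack_chr_shift;          // (3) back to the octet start
         //   haystack_len = (int32_t)haystack_len;                     // (2)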
2793     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
2794     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
2795     __ andi(result, result, haystack_isL ? -8 : -4);
2796     __ slli(tmp, match_mask, haystack_chr_shift);
2797     __ sub(haystack, haystack, tmp);
2798     __ sign_extend(haystack_len, haystack_len, 32);
2799     __ j(L_LOOP_PROCEED);
2800 
2801     __ align(OptoLoopAlignment);
2802     __ bind(NOMATCH);
2803     __ mv(result, -1);
2804 
2805     __ bind(DONE);
2806     __ pop_reg(spilled_regs, sp);
2807     __ ret();
2808     return entry;
2809   }
2810 
2811   void generate_string_indexof_stubs()
2812   {
2813     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
2814     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
2815     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
2816   }
2817 
2818 #ifdef COMPILER2
2819   address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
2820     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
2821 
2822     address start = __ pc();
2823     const Register
2824       r_super_klass  = x10,
2825       r_array_base   = x11,
2826       r_array_length = x12,
2827       r_array_index  = x13,
2828       r_sub_klass    = x14,
2829       result         = x15,
2830       r_bitmap       = x16;
2831 
2832     Label L_success;
2833     __ enter();
2834     __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, result,
2835                                      r_array_base, r_array_length, r_array_index,
2836                                      r_bitmap, super_klass_index, /*stub_is_near*/true);
2837     __ leave();
2838     __ ret();
2839 
2840     return start;
2841   }
2842 
2843   // Slow path implementation for UseSecondarySupersTable.
2844   address generate_lookup_secondary_supers_table_slow_path_stub() {
2845     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
2846 
2847     address start = __ pc();
2848     const Register
2849       r_super_klass  = x10,        // argument
2850       r_array_base   = x11,        // argument
2851       temp1          = x12,        // tmp
2852       r_array_index  = x13,        // argument
2853       result         = x15,        // argument
2854       r_bitmap       = x16;        // argument
2855 
2856 
2857     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
2858     __ ret();
2859 
2860     return start;
2861   }
2862 
2863   address generate_mulAdd()
2864   {
2865     __ align(CodeEntryAlignment);
2866     StubCodeMark mark(this, "StubRoutines", "mulAdd");
2867 
2868     address entry = __ pc();
2869 
2870     const Register out     = x10;
2871     const Register in      = x11;
2872     const Register offset  = x12;
2873     const Register len     = x13;
2874     const Register k       = x14;
2875     const Register tmp     = x28;
2876 
2877     BLOCK_COMMENT("Entry:");
2878     __ enter();
2879     __ mul_add(out, in, offset, len, k, tmp);
2880     __ leave();
2881     __ ret();
2882 
2883     return entry;
2884   }
2885 
2886   /**
2887    *  Arguments:
2888    *
2889    *  Input:
2890    *    c_rarg0   - x address
2891    *    c_rarg1   - x length
2892    *    c_rarg2   - y address
2893    *    c_rarg3   - y length
2894    *    c_rarg4   - z address
2895    */
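       // Contract sketch (mirroring BigInteger.multiplyToLen): x and y hold int
       // magnitudes with the most significant int first, and z receives their
       // product in xlen + ylen ints.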
2896   address generate_multiplyToLen()
2897   {
2898     __ align(CodeEntryAlignment);
2899     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2900     address entry = __ pc();
2901 
2902     const Register x     = x10;
2903     const Register xlen  = x11;
2904     const Register y     = x12;
2905     const Register ylen  = x13;
2906     const Register z     = x14;
2907 
2908     const Register tmp0  = x15;
2909     const Register tmp1  = x16;
2910     const Register tmp2  = x17;
2911     const Register tmp3  = x7;
2912     const Register tmp4  = x28;
2913     const Register tmp5  = x29;
2914     const Register tmp6  = x30;
2915     const Register tmp7  = x31;
2916 
2917     BLOCK_COMMENT("Entry:");
2918     __ enter(); // required for proper stackwalking of RuntimeStub frame
2919     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2920     __ leave(); // required for proper stackwalking of RuntimeStub frame
2921     __ ret();
2922 
2923     return entry;
2924   }
2925 
2926   address generate_squareToLen()
2927   {
2928     __ align(CodeEntryAlignment);
2929     StubCodeMark mark(this, "StubRoutines", "squareToLen");
2930     address entry = __ pc();
2931 
2932     const Register x     = x10;
2933     const Register xlen  = x11;
2934     const Register z     = x12;
2935     const Register y     = x14; // == x
2936     const Register ylen  = x15; // == xlen
2937 
2938     const Register tmp0  = x13; // zlen, unused
2939     const Register tmp1  = x16;
2940     const Register tmp2  = x17;
2941     const Register tmp3  = x7;
2942     const Register tmp4  = x28;
2943     const Register tmp5  = x29;
2944     const Register tmp6  = x30;
2945     const Register tmp7  = x31;
2946 
2947     BLOCK_COMMENT("Entry:");
2948     __ enter();
2949     __ mv(y, x);
2950     __ mv(ylen, xlen);
2951     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2952     __ leave();
2953     __ ret();
2954 
2955     return entry;
2956   }
2957 
2958   // Arguments:
2959   //
2960   // Input:
2961   //   c_rarg0   - newArr address
2962   //   c_rarg1   - oldArr address
2963   //   c_rarg2   - newIdx
2964   //   c_rarg3   - shiftCount
2965   //   c_rarg4   - numIter
2966   //
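       // Scalar equivalent of the vector loop below (illustrative sketch; 32-bit
       // unsigned ints):
       //   for (i = 0; i < numIter; i++)
       //     newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount));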
2967   address generate_bigIntegerLeftShift() {
2968     __ align(CodeEntryAlignment);
2969     StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
2970     address entry = __ pc();
2971 
2972     Label loop, exit;
2973 
2974     Register newArr        = c_rarg0;
2975     Register oldArr        = c_rarg1;
2976     Register newIdx        = c_rarg2;
2977     Register shiftCount    = c_rarg3;
2978     Register numIter       = c_rarg4;
2979 
2980     Register shiftRevCount = c_rarg5;
2981     Register oldArrNext    = t1;
2982 
2983     __ beqz(numIter, exit);
2984     __ shadd(newArr, newIdx, newArr, t0, 2);
2985 
2986     __ mv(shiftRevCount, 32);
2987     __ sub(shiftRevCount, shiftRevCount, shiftCount);
2988 
2989     __ bind(loop);
2990     __ addi(oldArrNext, oldArr, 4);
2991     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
2992     __ vle32_v(v0, oldArr);
2993     __ vle32_v(v4, oldArrNext);
2994     __ vsll_vx(v0, v0, shiftCount);
2995     __ vsrl_vx(v4, v4, shiftRevCount);
2996     __ vor_vv(v0, v0, v4);
2997     __ vse32_v(v0, newArr);
2998     __ sub(numIter, numIter, t0);
2999     __ shadd(oldArr, t0, oldArr, t1, 2);
3000     __ shadd(newArr, t0, newArr, t1, 2);
3001     __ bnez(numIter, loop);
3002 
3003     __ bind(exit);
3004     __ ret();
3005 
3006     return entry;
3007   }
3008 
3009   // Arguments:
3010   //
3011   // Input:
3012   //   c_rarg0   - newArr address
3013   //   c_rarg1   - oldArr address
3014   //   c_rarg2   - newIdx
3015   //   c_rarg3   - shiftCount
3016   //   c_rarg4   - numIter
3017   //
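       // Scalar equivalent of the vector loop below (illustrative sketch; 32-bit
       // unsigned ints, iterating from the most significant end):
       //   for (i = numIter - 1; i >= 0; i--)
       //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount) | (oldArr[i] << (32 - shiftCount));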
3018   address generate_bigIntegerRightShift() {
3019     __ align(CodeEntryAlignment);
3020     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
3021     address entry = __ pc();
3022 
3023     Label loop, exit;
3024 
3025     Register newArr        = c_rarg0;
3026     Register oldArr        = c_rarg1;
3027     Register newIdx        = c_rarg2;
3028     Register shiftCount    = c_rarg3;
3029     Register numIter       = c_rarg4;
3030     Register idx           = numIter;
3031 
3032     Register shiftRevCount = c_rarg5;
3033     Register oldArrNext    = c_rarg6;
3034     Register newArrCur     = t0;
3035     Register oldArrCur     = t1;
3036 
3037     __ beqz(idx, exit);
3038     __ shadd(newArr, newIdx, newArr, t0, 2);
3039 
3040     __ mv(shiftRevCount, 32);
3041     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3042 
3043     __ bind(loop);
3044     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3045     __ sub(idx, idx, t0);
3046     __ shadd(oldArrNext, idx, oldArr, t1, 2);
3047     __ shadd(newArrCur, idx, newArr, t1, 2);
3048     __ addi(oldArrCur, oldArrNext, 4);
3049     __ vle32_v(v0, oldArrCur);
3050     __ vle32_v(v4, oldArrNext);
3051     __ vsrl_vx(v0, v0, shiftCount);
3052     __ vsll_vx(v4, v4, shiftRevCount);
3053     __ vor_vv(v0, v0, v4);
3054     __ vse32_v(v0, newArrCur);
3055     __ bnez(idx, loop);
3056 
3057     __ bind(exit);
3058     __ ret();
3059 
3060     return entry;
3061   }
3062 #endif
3063 
3064 #ifdef COMPILER2
3065   class MontgomeryMultiplyGenerator : public MacroAssembler {
3066 
3067     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3068       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3069 
3070     RegSet _toSave;
3071     bool _squaring;
3072 
3073   public:
3074     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3075       : MacroAssembler(as->code()), _squaring(squaring) {
3076 
3077       // Register allocation
3078 
3079       RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3080       Pa_base = *regs;       // Argument registers
3081       if (squaring) {
3082         Pb_base = Pa_base;
3083       } else {
3084         Pb_base = *++regs;
3085       }
3086       Pn_base = *++regs;
3087       Rlen = *++regs;
3088       inv = *++regs;
3089       Pm_base = *++regs;
3090 
3091                         // Working registers:
3092       Ra =  *++regs;    // The current digit of a, b, n, and m.
3093       Rb =  *++regs;
3094       Rm =  *++regs;
3095       Rn =  *++regs;
3096 
3097       Pa =  *++regs;      // Pointers to the current/next digit of a, b, n, and m.
3098       Pb =  *++regs;
3099       Pm =  *++regs;
3100       Pn =  *++regs;
3101 
3102       tmp0 =  *++regs;    // Three registers which form a
3103       tmp1 =  *++regs;    // triple-precision accumulator.
3104       tmp2 =  *++regs;
3105 
3106       Ri =  x6;         // Inner and outer loop indexes.
3107       Rj =  x7;
3108 
3109       Rhi_ab = x28;     // Product registers: low and high parts
3110       Rlo_ab = x29;     // of a*b and m*n.
3111       Rhi_mn = x30;
3112       Rlo_mn = x31;
3113 
3114       // x18 and up are callee-saved.
3115       _toSave = RegSet::range(x18, *regs) + Pm_base;
3116     }
3117 
3118   private:
3119     void save_regs() {
3120       push_reg(_toSave, sp);
3121     }
3122 
3123     void restore_regs() {
3124       pop_reg(_toSave, sp);
3125     }
3126 
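         // Generate a loop that runs "block" count times at run time, unrolled by a
         // factor of two: an odd count enters at the second copy, so the aligned
         // loop body always executes the block twice per iteration.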
3127     template <typename T>
3128     void unroll_2(Register count, T block) {
3129       Label loop, end, odd;
3130       beqz(count, end);
3131       test_bit(t0, count, 0);
3132       bnez(t0, odd);
3133       align(16);
3134       bind(loop);
3135       (this->*block)();
3136       bind(odd);
3137       (this->*block)();
3138       addi(count, count, -2);
3139       bgtz(count, loop);
3140       bind(end);
3141     }
3142 
3143     template <typename T>
3144     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3145       Label loop, end, odd;
3146       beqz(count, end);
3147       test_bit(tmp, count, 0);
3148       bnez(tmp, odd);
3149       align(16);
3150       bind(loop);
3151       (this->*block)(d, s, tmp);
3152       bind(odd);
3153       (this->*block)(d, s, tmp);
3154       addi(count, count, -2);
3155       bgtz(count, loop);
3156       bind(end);
3157     }
3158 
3159     void pre1(RegisterOrConstant i) {
3160       block_comment("pre1");
3161       // Pa = Pa_base;
3162       // Pb = Pb_base + i;
3163       // Pm = Pm_base;
3164       // Pn = Pn_base + i;
3165       // Ra = *Pa;
3166       // Rb = *Pb;
3167       // Rm = *Pm;
3168       // Rn = *Pn;
3169       if (i.is_register()) {
3170         slli(t0, i.as_register(), LogBytesPerWord);
3171       } else {
3172         mv(t0, i.as_constant());
3173         slli(t0, t0, LogBytesPerWord);
3174       }
3175 
3176       mv(Pa, Pa_base);
3177       add(Pb, Pb_base, t0);
3178       mv(Pm, Pm_base);
3179       add(Pn, Pn_base, t0);
3180 
3181       ld(Ra, Address(Pa));
3182       ld(Rb, Address(Pb));
3183       ld(Rm, Address(Pm));
3184       ld(Rn, Address(Pn));
3185 
3186       // Zero the m*n result.
3187       mv(Rhi_mn, zr);
3188       mv(Rlo_mn, zr);
3189     }
3190 
3191     // The core multiply-accumulate step of a Montgomery
3192     // multiplication.  The idea is to schedule operations as a
3193     // pipeline so that instructions with long latencies (loads and
3194     // multiplies) have time to complete before their results are
3195     // used.  This most benefits in-order implementations of the
3196     // architecture but out-of-order ones also benefit.
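         // In scalar terms one step performs roughly (illustrative sketch; acc is
         // the triple-precision accumulator tmp2:tmp1:tmp0):
         //   acc += (*Pa++) * (*Pb--);   // current a-digit times b-digit
         //   acc += (*Pm++) * (*Pn--);   // current m-digit times n-digit, deferred one step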
3197     void step() {
3198       block_comment("step");
3199       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3200       // Ra = *++Pa;
3201       // Rb = *--Pb;
3202       mulhu(Rhi_ab, Ra, Rb);
3203       mul(Rlo_ab, Ra, Rb);
3204       addi(Pa, Pa, wordSize);
3205       ld(Ra, Address(Pa));
3206       addi(Pb, Pb, -wordSize);
3207       ld(Rb, Address(Pb));
3208       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
3209                                             // previous iteration.
3210       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3211       // Rm = *++Pm;
3212       // Rn = *--Pn;
3213       mulhu(Rhi_mn, Rm, Rn);
3214       mul(Rlo_mn, Rm, Rn);
3215       addi(Pm, Pm, wordSize);
3216       ld(Rm, Address(Pm));
3217       addi(Pn, Pn, -wordSize);
3218       ld(Rn, Address(Pn));
3219       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3220     }
3221 
3222     void post1() {
3223       block_comment("post1");
3224 
3225       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3226       // Ra = *++Pa;
3227       // Rb = *--Pb;
3228       mulhu(Rhi_ab, Ra, Rb);
3229       mul(Rlo_ab, Ra, Rb);
3230       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3231       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3232 
3233       // *Pm = Rm = tmp0 * inv;
3234       mul(Rm, tmp0, inv);
3235       sd(Rm, Address(Pm));
3236 
3237       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3238       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3239       mulhu(Rhi_mn, Rm, Rn);
3240 
3241 #ifndef PRODUCT
3242       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3243       {
3244         mul(Rlo_mn, Rm, Rn);
3245         add(Rlo_mn, tmp0, Rlo_mn);
3246         Label ok;
3247         beqz(Rlo_mn, ok);
3248         stop("broken Montgomery multiply");
3249         bind(ok);
3250       }
3251 #endif
3252       // We have very carefully set things up so that
3253       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3254       // the lower half of Rm * Rn because we know the result already:
3255       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3256       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3257       // the carry flag iff tmp0 is nonzero.
3258       //
3259       // mul(Rlo_mn, Rm, Rn);
3260       // cad(zr, tmp0, Rlo_mn);
3261       addi(t0, tmp0, -1);
3262       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3263       cadc(tmp0, tmp1, Rhi_mn, t0);
3264       adc(tmp1, tmp2, zr, t0);
3265       mv(tmp2, zr);
3266     }
3267 
3268     void pre2(Register i, Register len) {
3269       block_comment("pre2");
3270       // Pa = Pa_base + i-len;
3271       // Pb = Pb_base + len;
3272       // Pm = Pm_base + i-len;
3273       // Pn = Pn_base + len;
3274 
3275       sub(Rj, i, len);
3276       // Rj == i-len
3277 
3278       // Ra as temp register
3279       slli(Ra, Rj, LogBytesPerWord);
3280       add(Pa, Pa_base, Ra);
3281       add(Pm, Pm_base, Ra);
3282       slli(Ra, len, LogBytesPerWord);
3283       add(Pb, Pb_base, Ra);
3284       add(Pn, Pn_base, Ra);
3285 
3286       // Ra = *++Pa;
3287       // Rb = *--Pb;
3288       // Rm = *++Pm;
3289       // Rn = *--Pn;
3290       add(Pa, Pa, wordSize);
3291       ld(Ra, Address(Pa));
3292       add(Pb, Pb, -wordSize);
3293       ld(Rb, Address(Pb));
3294       add(Pm, Pm, wordSize);
3295       ld(Rm, Address(Pm));
3296       add(Pn, Pn, -wordSize);
3297       ld(Rn, Address(Pn));
3298 
3299       mv(Rhi_mn, zr);
3300       mv(Rlo_mn, zr);
3301     }
3302 
3303     void post2(Register i, Register len) {
3304       block_comment("post2");
3305       sub(Rj, i, len);
3306 
3307       cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
3308 
3309       // As soon as we know the least significant digit of our result,
3310       // store it.
3311       // Pm_base[i-len] = tmp0;
3312       // Rj as temp register
3313       slli(Rj, Rj, LogBytesPerWord);
3314       add(Rj, Pm_base, Rj);
3315       sd(tmp0, Address(Rj));
3316 
3317       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3318       cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
3319       adc(tmp1, tmp2, zr, t0);
3320       mv(tmp2, zr);
3321     }
3322 
3323     // A carry in tmp0 after Montgomery multiplication means that we
3324     // should subtract multiples of n from our result in m.  We'll
3325     // keep doing that until there is no carry.
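         // In scalar terms (sketch): while the accumulated carry tmp0 is nonzero,
         // perform a multi-word m -= n with borrow propagation across len words and
         // fold the final borrow back into tmp0.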
3326     void normalize(Register len) {
3327       block_comment("normalize");
3328       // while (tmp0)
3329       //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
3330       Label loop, post, again;
3331       Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
3332       beqz(tmp0, post); {
3333         bind(again); {
3334           mv(i, zr);
3335           mv(cnt, len);
3336           slli(Rn, i, LogBytesPerWord);
3337           add(Rm, Pm_base, Rn);
3338           ld(Rm, Address(Rm));
3339           add(Rn, Pn_base, Rn);
3340           ld(Rn, Address(Rn));
3341           mv(t0, 1); // set carry flag, i.e. no borrow
3342           align(16);
3343           bind(loop); {
3344             notr(Rn, Rn);
3345             add(Rm, Rm, t0);
3346             add(Rm, Rm, Rn);
3347             sltu(t0, Rm, Rn);
3348             slli(Rn, i, LogBytesPerWord); // Rn as temp register
3349             add(Rn, Pm_base, Rn);
3350             sd(Rm, Address(Rn));
3351             add(i, i, 1);
3352             slli(Rn, i, LogBytesPerWord);
3353             add(Rm, Pm_base, Rn);
3354             ld(Rm, Address(Rm));
3355             add(Rn, Pn_base, Rn);
3356             ld(Rn, Address(Rn));
3357             sub(cnt, cnt, 1);
3358           } bnez(cnt, loop);
3359           addi(tmp0, tmp0, -1);
3360           add(tmp0, tmp0, t0);
3361         } bnez(tmp0, again);
3362       } bind(post);
3363     }
3364 
3365     // Move memory at s to d, reversing words.
3366     //    Increments d to end of copied memory
3367     //    Destroys tmp1, tmp2
3368     //    Preserves len
3369     //    Leaves s pointing to the address which was in d at start
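         //    In scalar terms (sketch): d[i] = rotate(s[len - 1 - i], 32) for i in
         //    [0, len), i.e. the word order is reversed and the two 32-bit halves
         //    of each word are swapped.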
3370     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3371       assert(tmp1->encoding() < x28->encoding(), "register corruption");
3372       assert(tmp2->encoding() < x28->encoding(), "register corruption");
3373 
3374       shadd(s, len, s, tmp1, LogBytesPerWord);
3375       mv(tmp1, len);
3376       unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3377       slli(tmp1, len, LogBytesPerWord);
3378       sub(s, d, tmp1);
3379     }
3380     // [63...0] -> [31...0][63...32]
3381     void reverse1(Register d, Register s, Register tmp) {
3382       addi(s, s, -wordSize);
3383       ld(tmp, Address(s));
3384       ror_imm(tmp, tmp, 32, t0);
3385       sd(tmp, Address(d));
3386       addi(d, d, wordSize);
3387     }
3388 
3389     void step_squaring() {
3390       // An extra ACC
3391       step();
3392       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3393     }
3394 
3395     void last_squaring(Register i) {
3396       Label dont;
3397       // if ((i & 1) == 0) {
3398       test_bit(t0, i, 0);
3399       bnez(t0, dont); {
3400         // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3401         // Ra = *++Pa;
3402         // Rb = *--Pb;
3403         mulhu(Rhi_ab, Ra, Rb);
3404         mul(Rlo_ab, Ra, Rb);
3405         acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3406       } bind(dont);
3407     }
3408 
3409     void extra_step_squaring() {
3410       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3411 
3412       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3413       // Rm = *++Pm;
3414       // Rn = *--Pn;
3415       mulhu(Rhi_mn, Rm, Rn);
3416       mul(Rlo_mn, Rm, Rn);
3417       addi(Pm, Pm, wordSize);
3418       ld(Rm, Address(Pm));
3419       addi(Pn, Pn, -wordSize);
3420       ld(Rn, Address(Pn));
3421     }
3422 
3423     void post1_squaring() {
3424       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3425 
3426       // *Pm = Rm = tmp0 * inv;
3427       mul(Rm, tmp0, inv);
3428       sd(Rm, Address(Pm));
3429 
3430       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3431       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3432       mulhu(Rhi_mn, Rm, Rn);
3433 
3434 #ifndef PRODUCT
3435       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3436       {
3437         mul(Rlo_mn, Rm, Rn);
3438         add(Rlo_mn, tmp0, Rlo_mn);
3439         Label ok;
3440         beqz(Rlo_mn, ok); {
3441           stop("broken Montgomery multiply");
3442         } bind(ok);
3443       }
3444 #endif
3445       // We have very carefully set things up so that
3446       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3447       // the lower half of Rm * Rn because we know the result already:
3448       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3449       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3450       // the carry flag iff tmp0 is nonzero.
3451       //
3452       // mul(Rlo_mn, Rm, Rn);
3453       // cad(zr, tmp0, Rlo_mn);
3454       addi(t0, tmp0, -1);
3455       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3456       cadc(tmp0, tmp1, Rhi_mn, t0);
3457       adc(tmp1, tmp2, zr, t0);
3458       mv(tmp2, zr);
3459     }
3460 
3461     // use t0 as carry
3462     void acc(Register Rhi, Register Rlo,
3463              Register tmp0, Register tmp1, Register tmp2) {
3464       cad(tmp0, tmp0, Rlo, t0);
3465       cadc(tmp1, tmp1, Rhi, t0);
3466       adc(tmp2, tmp2, zr, t0);
3467     }
3468 
3469   public:
3470     /**
3471      * Fast Montgomery multiplication.  The derivation of the
3472      * algorithm is in A Cryptographic Library for the Motorola
3473      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3474      *
3475      * Arguments:
3476      *
3477      * Inputs for multiplication:
3478      *   c_rarg0   - int array elements a
3479      *   c_rarg1   - int array elements b
3480      *   c_rarg2   - int array elements n (the modulus)
3481      *   c_rarg3   - int length
3482      *   c_rarg4   - int inv
3483      *   c_rarg5   - int array elements m (the result)
3484      *
3485      * Inputs for squaring:
3486      *   c_rarg0   - int array elements a
3487      *   c_rarg1   - int array elements n (the modulus)
3488      *   c_rarg2   - int length
3489      *   c_rarg3   - int inv
3490      *   c_rarg4   - int array elements m (the result)
3491      *
3492      */
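         //
         // In scalar terms the generated code computes roughly (illustrative sketch;
         // acc is the triple-precision accumulator tmp2:tmp1:tmp0):
         //
         //   for (i = 0; i < len; i++) {
         //     for (j = 0; j < i; j++)
         //       acc += a[j]*b[i-j] + m[j]*n[i-j];
         //     acc += a[i]*b[0];
         //     m[i] = (julong)acc * inv;   // low 64 bits
         //     acc += m[i]*n[0];           // low 64 bits become zero
         //     acc >>= 64;
         //   }
         //   for (i = len; i < 2*len; i++) {
         //     for (j = i-len+1; j < len; j++)
         //       acc += a[j]*b[i-j] + m[j]*n[i-j];
         //     m[i-len] = (julong)acc;
         //     acc >>= 64;
         //   }
         //   while (acc)                   // normalize: m -= n until no carry remains
         //     acc = sub(m, n, acc, len);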
3493     address generate_multiply() {
3494       Label argh, nothing;
3495       bind(argh);
3496       stop("MontgomeryMultiply total_allocation must be <= 8192");
3497 
3498       align(CodeEntryAlignment);
3499       address entry = pc();
3500 
3501       beqz(Rlen, nothing);
3502 
3503       enter();
3504 
3505       // Make room.
3506       mv(Ra, 512);
3507       bgt(Rlen, Ra, argh);
3508       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3509       sub(Ra, sp, Ra);
3510       andi(sp, Ra, -2 * wordSize);
3511 
3512       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3513 
3514       {
3515         // Copy input args, reversing as we go.  We use Ra as a
3516         // temporary variable.
3517         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3518         if (!_squaring)
3519           reverse(Ra, Pb_base, Rlen, Ri, Rj);
3520         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3521       }
3522 
3523       // Push all call-saved registers and also Pm_base which we'll need
3524       // at the end.
3525       save_regs();
3526 
3527 #ifndef PRODUCT
3528       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3529       {
3530         ld(Rn, Address(Pn_base));
3531         mul(Rlo_mn, Rn, inv);
3532         mv(t0, -1);
3533         Label ok;
3534         beq(Rlo_mn, t0, ok);
3535         stop("broken inverse in Montgomery multiply");
3536         bind(ok);
3537       }
3538 #endif
3539 
3540       mv(Pm_base, Ra);
3541 
3542       mv(tmp0, zr);
3543       mv(tmp1, zr);
3544       mv(tmp2, zr);
3545 
3546       block_comment("for (int i = 0; i < len; i++) {");
3547       mv(Ri, zr); {
3548         Label loop, end;
3549         bge(Ri, Rlen, end);
3550 
3551         bind(loop);
3552         pre1(Ri);
3553 
3554         block_comment("  for (j = i; j; j--) {"); {
3555           mv(Rj, Ri);
3556           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3557         } block_comment("  } // j");
3558 
3559         post1();
3560         addw(Ri, Ri, 1);
3561         blt(Ri, Rlen, loop);
3562         bind(end);
3563         block_comment("} // i");
3564       }
3565 
3566       block_comment("for (int i = len; i < 2*len; i++) {");
3567       mv(Ri, Rlen); {
3568         Label loop, end;
3569         slli(t0, Rlen, 1);
3570         bge(Ri, t0, end);
3571 
3572         bind(loop);
3573         pre2(Ri, Rlen);
3574 
3575         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3576           slliw(Rj, Rlen, 1);
3577           subw(Rj, Rj, Ri);
3578           subw(Rj, Rj, 1);
3579           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3580         } block_comment("  } // j");
3581 
3582         post2(Ri, Rlen);
3583         addw(Ri, Ri, 1);
3584         slli(t0, Rlen, 1);
3585         blt(Ri, t0, loop);
3586         bind(end);
3587       }
3588       block_comment("} // i");
3589 
3590       normalize(Rlen);
3591 
3592       mv(Ra, Pm_base);  // Save Pm_base in Ra
3593       restore_regs();  // Restore caller's Pm_base
3594 
3595       // Copy our result into caller's Pm_base
3596       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3597 
3598       leave();
3599       bind(nothing);
3600       ret();
3601 
3602       return entry;
3603     }
3604 
3605     /**
3606      *
3607      * Arguments:
3608      *
3609      * Inputs:
3610      *   c_rarg0   - int array elements a
3611      *   c_rarg1   - int array elements n (the modulus)
3612      *   c_rarg2   - int length
3613      *   c_rarg3   - int inv
3614      *   c_rarg4   - int array elements m (the result)
3615      *
3616      */
3617     address generate_square() {
3618       Label argh;
3619       bind(argh);
3620       stop("MontgomeryMultiply total_allocation must be <= 8192");
3621 
3622       align(CodeEntryAlignment);
3623       address entry = pc();
3624 
3625       enter();
3626 
3627       // Make room.
3628       mv(Ra, 512);
3629       bgt(Rlen, Ra, argh);
3630       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3631       sub(Ra, sp, Ra);
3632       andi(sp, Ra, -2 * wordSize);
3633 
3634       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3635 
3636       {
3637         // Copy input args, reversing as we go.  We use Ra as a
3638         // temporary variable.
3639         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3640         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3641       }
3642 
3643       // Push all call-saved registers and also Pm_base which we'll need
3644       // at the end.
3645       save_regs();
3646 
3647       mv(Pm_base, Ra);
3648 
3649       mv(tmp0, zr);
3650       mv(tmp1, zr);
3651       mv(tmp2, zr);
3652 
3653       block_comment("for (int i = 0; i < len; i++) {");
3654       mv(Ri, zr); {
3655         Label loop, end;
3656         bind(loop);
3657         bge(Ri, Rlen, end);
3658 
3659         pre1(Ri);
3660 
3661         block_comment("for (j = (i+1)/2; j; j--) {"); {
3662           addi(Rj, Ri, 1);
3663           srliw(Rj, Rj, 1);
3664           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3665         } block_comment("  } // j");
3666 
3667         last_squaring(Ri);
3668 
3669         block_comment("  for (j = i/2; j; j--) {"); {
3670           srliw(Rj, Ri, 1);
3671           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3672         } block_comment("  } // j");
3673 
3674         post1_squaring();
3675         addi(Ri, Ri, 1);
3676         blt(Ri, Rlen, loop);
3677 
3678         bind(end);
3679         block_comment("} // i");
3680       }
3681 
3682       block_comment("for (int i = len; i < 2*len; i++) {");
3683       mv(Ri, Rlen); {
3684         Label loop, end;
3685         bind(loop);
3686         slli(t0, Rlen, 1);
3687         bge(Ri, t0, end);
3688 
3689         pre2(Ri, Rlen);
3690 
3691         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
3692           slli(Rj, Rlen, 1);
3693           sub(Rj, Rj, Ri);
3694           sub(Rj, Rj, 1);
3695           srliw(Rj, Rj, 1);
3696           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3697         } block_comment("  } // j");
3698 
3699         last_squaring(Ri);
3700 
3701         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
3702           slli(Rj, Rlen, 1);
3703           sub(Rj, Rj, Ri);
3704           srliw(Rj, Rj, 1);
3705           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3706         } block_comment("  } // j");
3707 
3708         post2(Ri, Rlen);
3709         addi(Ri, Ri, 1);
3710         slli(t0, Rlen, 1);
3711         blt(Ri, t0, loop);
3712 
3713         bind(end);
3714         block_comment("} // i");
3715       }
3716 
3717       normalize(Rlen);
3718 
3719       mv(Ra, Pm_base);  // Save Pm_base in Ra
3720       restore_regs();  // Restore caller's Pm_base
3721 
3722       // Copy our result into caller's Pm_base
3723       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3724 
3725       leave();
3726       ret();
3727 
3728       return entry;
3729     }
3730   };
3731 
3732 #endif // COMPILER2
3733 
3734   address generate_cont_thaw(Continuation::thaw_kind kind) {
3735     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
3736     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
3737 
3738     address start = __ pc();
3739 
3740     if (return_barrier) {
3741       __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
3742     }
3743 
3744 #ifndef PRODUCT
3745     {
3746       Label OK;
3747       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
3748       __ beq(sp, t0, OK);
3749       __ stop("incorrect sp");
3750       __ bind(OK);
3751     }
3752 #endif
3753 
3754     if (return_barrier) {
3755       // preserve possible return value from a method returning to the return barrier
3756       __ sub(sp, sp, 2 * wordSize);
3757       __ fsd(f10, Address(sp, 0 * wordSize));
3758       __ sd(x10, Address(sp, 1 * wordSize));
3759     }
3760 
3761     __ mv(c_rarg1, (return_barrier ? 1 : 0));
3762     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
3763     __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames
3764 
3765     if (return_barrier) {
3766       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
3767       __ ld(x10, Address(sp, 1 * wordSize));
3768       __ fld(f10, Address(sp, 0 * wordSize));
3769       __ add(sp, sp, 2 * wordSize);
3770     }
3771 
3772 #ifndef PRODUCT
3773     {
3774       Label OK;
3775       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
3776       __ beq(sp, t0, OK);
3777       __ stop("incorrect sp");
3778       __ bind(OK);
3779     }
3780 #endif
3781 
3782     Label thaw_success;
3783     // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
3784     __ bnez(t1, thaw_success);
3785     __ la(t0, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
3786     __ jr(t0);
3787     __ bind(thaw_success);
3788 
3789     // make room for the thawed frames
3790     __ sub(t0, sp, t1);
3791     __ andi(sp, t0, -16); // align
3792 
3793     if (return_barrier) {
3794       // save original return value -- again
3795       __ sub(sp, sp, 2 * wordSize);
3796       __ fsd(f10, Address(sp, 0 * wordSize));
3797       __ sd(x10, Address(sp, 1 * wordSize));
3798     }
3799 
3800     // If we want, we can templatize thaw by kind, and have three different entries
3801     __ mv(c_rarg1, kind);
3802 
3803     __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
3804     __ mv(t1, x10); // x10 is the sp of the yielding frame
3805 
3806     if (return_barrier) {
3807       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
3808       __ ld(x10, Address(sp, 1 * wordSize));
3809       __ fld(f10, Address(sp, 0 * wordSize));
3810       __ add(sp, sp, 2 * wordSize);
3811     } else {
3812       __ mv(x10, zr); // return 0 (success) from doYield
3813     }
3814 
3815     // we're now on the yield frame (which is at an address above us b/c sp has been pushed down)
3816     __ mv(fp, t1);
3817     __ sub(sp, t1, 2 * wordSize); // now pointing to fp spill
3818 
3819     if (return_barrier_exception) {
3820       __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
3821       __ verify_oop(x10);
3822       __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9
3823 
3824       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);
3825 
3826       // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc
3827 
3828       __ mv(x11, x10); // the exception handler
3829       __ mv(x10, x9); // restore return value containing the exception oop
3830       __ verify_oop(x10);
3831 
3832       __ leave();
3833       __ mv(x13, ra);
3834       __ jr(x11); // the exception handler
3835     } else {
3836       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
3837       __ leave();
3838       __ ret();
3839     }
3840 
3841     return start;
3842   }
3843 
3844   address generate_cont_thaw() {
3845     if (!Continuations::enabled()) return nullptr;
3846 
3847     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
3848     address start = __ pc();
3849     generate_cont_thaw(Continuation::thaw_top);
3850     return start;
3851   }
3852 
3853   address generate_cont_returnBarrier() {
3854     if (!Continuations::enabled()) return nullptr;
3855 
3856     // TODO: will probably need multiple return barriers depending on return type
3857     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
3858     address start = __ pc();
3859 
3860     generate_cont_thaw(Continuation::thaw_return_barrier);
3861 
3862     return start;
3863   }
3864 
3865   address generate_cont_returnBarrier_exception() {
3866     if (!Continuations::enabled()) return nullptr;
3867 
3868     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
3869     address start = __ pc();
3870 
3871     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
3872 
3873     return start;
3874   }
3875 
3876   address generate_cont_preempt_stub() {
3877     if (!Continuations::enabled()) return nullptr;
3878     StubCodeMark mark(this, "StubRoutines","Continuation preempt stub");
3879     address start = __ pc();
3880 
3881     __ reset_last_Java_frame(true);
3882 
3883     // reset the preempting flag
3884     __ sb(zr, Address(xthread, JavaThread::preempting_offset()));
3885 
3886     // Set sp to enterSpecial frame and then remove it from the stack
3887     __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
3888 
3889     Label preemption_cancelled;
3890     __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
3891     __ bnez(t0, preemption_cancelled);
3892 
3893     // Remove enterSpecial frame from the stack and return to Continuation.run()
3894     SharedRuntime::continuation_enter_cleanup(_masm);
3895     __ leave();
3896     __ ret();
3897 
3898     __ bind(preemption_cancelled);
3899     __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
3900     __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
3901     __ la(t0, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
3902     __ ld(t0, Address(t0));
3903     __ jr(t0);
3904 
3905     return start;
3906   }
3907 
3908 #if COMPILER2_OR_JVMCI
3909 
3910 #undef __
3911 #define __ this->
3912 
3913   class Sha2Generator : public MacroAssembler {
3914     StubCodeGenerator* _cgen;
3915    public:
3916       Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
3917       address generate_sha256_implCompress(bool multi_block) {
3918         return generate_sha2_implCompress(Assembler::e32, multi_block);
3919       }
3920       address generate_sha512_implCompress(bool multi_block) {
3921         return generate_sha2_implCompress(Assembler::e64, multi_block);
3922       }
3923    private:
3924 
3925     void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
3926       if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
3927       else                            __ vle64_v(vr, sr);
3928     }
3929 
3930     void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
3931       if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
3932       else                            __ vse64_v(vr, sr);
3933     }
3934 
3935     // Overview of the logic in each "quad round".
3936     //
3937     // The code below repeats 16/20 times the logic implementing four rounds
3938     // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds"
3939     // correspond to the 64/80 single rounds.
3940     //
3941     //    // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
3942     //    // Output:
3943     //    //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
3944     //    vl1reXX.v vTmp1, ofs
3945     //
3946     //    // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
3947     //    addi ofs, ofs, 16/32
3948     //
3949     //    // Add constants to message schedule words:
3950     //    //  Input
3951     //    //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
3952     //    //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
3953     //    //  Output
3954     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
3955     //    vadd.vv vTmp0, vTmp1, vW0
3956     //
3957     //    //  2 rounds of working variables updates.
3958     //    //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
3959     //    //  Input:
3960     //    //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
3961     //    //    vState0 = {a[t],b[t],e[t],f[t]}
3962     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
3963     //    //  Output:
3964     //    //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
3965     //    //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
3966     //    vsha2cl.vv vState1, vState0, vTmp0
3967     //
3968     //    //  2 rounds of working variables updates.
3969     //    //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
3970     //    //  Input
3971     //    //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
3972     //    //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
3973     //    //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
3974     //    //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
3975     //    //  Output:
3976     //    //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
3977     //    vsha2ch.vv vState0, vState1, vTmp0
3978     //
3979     //    // Combine 2QW into 1QW
3980     //    //
3981     //    // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
3982     //    //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
3983     //    // and it can only take 3 vectors as inputs. Hence we need to combine
3984     //    // vW1[0] and vW2[1..3] in a single vector.
3985     //    //
3986     //    // vmerge.vvm vTmp0, vW2, vW1, v0
3987     //    // Input
3988     //    //  V0 = mask // first word from vW2, 1..3 words from vW1
3989     //    //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
3990     //    //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
3991     //    // Output
3992     //    //  vTmp0 = {Wt-12, Wt-7, Wt-6, Wt-5}
3993     //    vmerge.vvm vTmp0, vW2, vW1, v0
3994     //
3995     //    // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
3996     //    // Input
3997     //    //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
3998     //    //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
3999     //    //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}     W[11: 9,4]
4000     //    // Output (next four message schedule words)
4001     //    //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
4002     //    vsha2ms.vv vW0, vTmp0, vW3
4003     //
4004     // BEFORE
4005     //  vW0 - vW3 hold the message schedule words (initially the block words)
4006     //    vW0 = W[ 3: 0]   "oldest"
4007     //    vW1 = W[ 7: 4]
4008     //    vW2 = W[11: 8]
4009     //    vW3 = W[15:12]   "newest"
4010     //
4011     //  vState0 - vState1 hold the working state variables
4012     //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
4013     //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
4014     //
4015     // AFTER
4016     //  vW0 - vW3 hold the message schedule words (initially the block words)
4017     //    vW1 = W[ 7: 4]   "oldest"
4018     //    vW2 = W[11: 8]
4019     //    vW3 = W[15:12]
4020     //    vW0 = W[19:16]   "newest"
4021     //
4022     //  vState0 and vState1 hold the working state variables
4023     //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
4024     //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
4025     //
4026     //  The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
4027     //  hence the uses of those vectors rotate in each round, and we get back to the
4028     //  initial configuration every 4 quad-rounds. We could avoid those changes at
4029     //  the cost of moving those vectors at the end of each quad-rounds.
4030     void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
4031                          Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
4032                          bool gen_words = true, bool step_const = true) {
4033       __ vleXX_v(vset_sew, vtemp, scalarconst);
4034       if (step_const) {
4035         __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
4036       }
4037       __ vadd_vv(vtemp2, vtemp, rot1);
4038       __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
4039       __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4040       if (gen_words) {
4041         __ vmerge_vvm(vtemp2, rot3, rot2);
4042         __ vsha2ms_vv(rot1, vtemp2, rot4);
4043       }
4044     }
4045 
4046     const char* stub_name(Assembler::SEW vset_sew, bool multi_block) {
4047       if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress";
4048       if (vset_sew == Assembler::e32 &&  multi_block) return "sha256_implCompressMB";
4049       if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress";
4050       if (vset_sew == Assembler::e64 &&  multi_block) return "sha512_implCompressMB";
4051       ShouldNotReachHere();
4052       return "bad name lookup";
4053     }
4054 
4055     // Arguments:
4056     //
4057     // Inputs:
4058     //   c_rarg0   - byte[]  source+offset
4059     //   c_rarg1   - int[]   SHA.state
4060     //   c_rarg2   - int     offset
4061     //   c_rarg3   - int     limit
4062     //
4063     address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) {
4064       alignas(64) static const uint32_t round_consts_256[64] = {
4065         0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
4066         0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
4067         0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
4068         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
4069         0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
4070         0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
4071         0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
4072         0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
4073         0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
4074         0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
4075         0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
4076         0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
4077         0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
4078         0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
4079         0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
4080         0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
4081       };
4082       alignas(64) static const uint64_t round_consts_512[80] = {
4083         0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
4084         0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
4085         0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
4086         0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
4087         0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
4088         0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
4089         0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
4090         0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
4091         0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
4092         0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
4093         0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
4094         0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
4095         0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
4096         0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
4097         0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
4098         0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
4099         0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
4100         0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
4101         0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
4102         0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
4103         0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
4104         0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
4105         0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
4106         0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
4107         0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
4108         0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
4109         0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
4110       };
4111       const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
4112 
4113       __ align(CodeEntryAlignment);
4114       StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block));
4115       address start = __ pc();
4116 
4117       Register buf   = c_rarg0;
4118       Register state = c_rarg1;
4119       Register ofs   = c_rarg2;
4120       Register limit = c_rarg3;
4121       Register consts =  t2; // caller saved
4122       Register state_c = x28; // caller saved
4123       VectorRegister vindex = v2;
4124       VectorRegister vW0 = v4;
4125       VectorRegister vW1 = v6;
4126       VectorRegister vW2 = v8;
4127       VectorRegister vW3 = v10;
4128       VectorRegister vState0 = v12;
4129       VectorRegister vState1 = v14;
4130       VectorRegister vHash0  = v16;
4131       VectorRegister vHash1  = v18;
4132       VectorRegister vTmp0   = v20;
4133       VectorRegister vTmp1   = v22;
4134 
4135       Label multi_block_loop;
4136 
4137       __ enter();
4138 
4139       address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
4140       la(consts, ExternalAddress(constant_table));
4141 
4142       // Register use in this function:
4143       //
4144       // VECTORS
4145       //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
4146       //             schedule words (Wt). They start with the message block
4147       //             content (W0 to W15), then further words in the message
4148       //             schedule generated via vsha2ms from previous Wt.
4149       //   Initially:
4150       //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
4151       //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
4152       //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
4153       //     vW3 = W[15:12] = {W15, W14, W13, W12}
4154       //
4155       //  vState0 - vState1 hold the working state variables (a, b, ..., h)
4156       //    vState0 = {f[t],e[t],b[t],a[t]}
4157       //    vState1 = {h[t],g[t],d[t],c[t]}
4158       //   Initially:
4159       //    vState0 = {H5i-1, H4i-1, H1i-1, H0i-1}
4160       //    vState1 = {H7i-1, H6i-1, H3i-1, H2i-1}
4161       //
4162       //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
4163       //
4164       //  vTmp0 = temporary, Wt+Kt
4165       //  vTmp1 = temporary, Kt
4166       //
4167       //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
4168       //
4169       // During most of the function the vector state is configured so that each
4170       // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
4171 
4172       // vsha2ch/vsha2cl uses EGW of 4*SEW.
4173       // SHA256 SEW = e32, EGW = 128-bits
4174       // SHA512 SEW = e64, EGW = 256-bits
4175       //
4176       // VLEN is required to be at least 128.
4177       // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
4178       //
4179       // m1: LMUL=1/2
4180       // ta: tail agnostic (don't care about those lanes)
4181       // ma: mask agnostic (don't care about those lanes)
4182       // x0 is not written; we know the number of vector elements.
4183 
4184       if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
4185         __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
4186       } else {
4187         __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
4188       }
4189 
4190       int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
4191       __ li(t0, indexes);
4192       __ vmv_v_x(vindex, t0);
4193 
4194       // Step over a and b so that state_c points to c.
4195       // const_add is the size of 4 state variables, so const_add/2 skips 2 of them (a and b).
4196       __ addi(state_c, state, const_add/2);
4197 
4198       // Use index-load to get {f,e,b,a},{h,g,d,c}
4199       __ vluxei8_v(vState0, state, vindex);
4200       __ vluxei8_v(vState1, state_c, vindex);
4201 
4202       __ bind(multi_block_loop);
4203 
4204       // Capture the initial H values in vHash0 and vHash1 to allow for computing
4205       // the resulting H', since H' = H+{a',b',c',...,h'}.
4206       __ vmv_v_v(vHash0, vState0);
4207       __ vmv_v_v(vHash1, vState1);
4208 
4209       // Load the 512/1024-bits of the message block in vW0-vW3 and perform
4210       // an endian swap on each 4/8 bytes element.
4211       //
4212       // If Zvkb is not implemented one can use vrgather
4213       // with an index sequence to byte-swap.
4214       //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
4215       //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
4216       //  this sequence. 'vid' gives us the N.
4217       __ vleXX_v(vset_sew, vW0, buf);
4218       __ vrev8_v(vW0, vW0);
4219       __ addi(buf, buf, const_add);
4220       __ vleXX_v(vset_sew, vW1, buf);
4221       __ vrev8_v(vW1, vW1);
4222       __ addi(buf, buf, const_add);
4223       __ vleXX_v(vset_sew, vW2, buf);
4224       __ vrev8_v(vW2, vW2);
4225       __ addi(buf, buf, const_add);
4226       __ vleXX_v(vset_sew, vW3, buf);
4227       __ vrev8_v(vW3, vW3);
4228       __ addi(buf, buf, const_add);
4229 
4230       // Set v0 up for the vmerge that replaces the first word (idx==0)
4231       __ vid_v(v0);
4232       __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)
4233 
4234       VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
4235       int rot_pos = 0;
4236       // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
4237       const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
4238       for (int i = 0; i < qr_end; i++) {
4239         sha2_quad_round(vset_sew,
4240                    rotation_regs[(rot_pos + 0) & 0x3],
4241                    rotation_regs[(rot_pos + 1) & 0x3],
4242                    rotation_regs[(rot_pos + 2) & 0x3],
4243                    rotation_regs[(rot_pos + 3) & 0x3],
4244                    consts,
4245                    vTmp1, vTmp0, vState0, vState1);
4246         ++rot_pos;
4247       }
4248       // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
4249       // Note that we stop generating new message schedule words (Wt, vW0-vW3)
4250       // as we already generated all the words we end up consuming (i.e., W[63:60]).
4251       const int qr_c_end = qr_end + 4;
4252       for (int i = qr_end; i < qr_c_end; i++) {
4253         sha2_quad_round(vset_sew,
4254                    rotation_regs[(rot_pos + 0) & 0x3],
4255                    rotation_regs[(rot_pos + 1) & 0x3],
4256                    rotation_regs[(rot_pos + 2) & 0x3],
4257                    rotation_regs[(rot_pos + 3) & 0x3],
4258                    consts,
4259                    vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
4260         ++rot_pos;
4261       }
4262 
4263       //--------------------------------------------------------------------------------
4264       // Compute the updated hash value H'
4265       //   H' = H + {h',g',...,b',a'}
4266       //      = {h,g,...,b,a} + {h',g',...,b',a'}
4267       //      = {h+h',g+g',...,b+b',a+a'}
4268 
4269       // H' = H+{a',b',c',...,h'}
4270       __ vadd_vv(vState0, vHash0, vState0);
4271       __ vadd_vv(vState1, vHash1, vState1);
4272 
4273       if (multi_block) {
4274         int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
4275         __ addi(consts, consts, -total_adds);
4276         __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
4277         __ ble(ofs, limit, multi_block_loop);
4278         __ mv(c_rarg0, ofs); // return ofs
4279       }
4280 
      // Store H[0..7] = {a,b,c,d,e,f,g,h} from
4282       //  vState0 = {f,e,b,a}
4283       //  vState1 = {h,g,d,c}
4284       __ vsuxei8_v(vState0, state,   vindex);
4285       __ vsuxei8_v(vState1, state_c, vindex);
4286 
4287       __ leave();
4288       __ ret();
4289 
4290       return start;
4291     }
4292   };
4293 
4294 #undef __
4295 #define __ _masm->
4296 
4297   // Set of L registers that correspond to a contiguous memory area.
4298   // Each 64-bit register typically corresponds to 2 32-bit integers.
4299   template <uint L>
4300   class RegCache {
4301   private:
4302     MacroAssembler *_masm;
4303     Register _regs[L];
4304 
4305   public:
4306     RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
4307       assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
4308       auto it = rs.begin();
4309       for (auto &r: _regs) {
4310         r = *it;
4311         ++it;
4312       }
4313     }
4314 
4315     // generate load for the i'th register
4316     void gen_load(uint i, Register base) {
4317       assert(i < L, "invalid i: %u", i);
4318       __ ld(_regs[i], Address(base, 8 * i));
4319     }
4320 
4321     // add i'th 32-bit integer to dest
4322     void add_u32(const Register dest, uint i, const Register rtmp = t0) {
4323       assert(i < 2 * L, "invalid i: %u", i);
4324 
4325       if (is_even(i)) {
4326         // Use the bottom 32 bits. No need to mask off the top 32 bits
4327         // as addw will do the right thing.
4328         __ addw(dest, dest, _regs[i / 2]);
4329       } else {
4330         // Use the top 32 bits by right-shifting them.
4331         __ srli(rtmp, _regs[i / 2], 32);
4332         __ addw(dest, dest, rtmp);
4333       }
4334     }
4335   };
4336 
4337   typedef RegCache<8> BufRegCache;
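
  // Note (illustrative): BufRegCache caches the 64-byte MD5 input block in
  // 8 x 64-bit registers. gen_load(i) loads bytes [8*i, 8*i+8) of the block,
  // and add_u32(dst, i) adds the i-th little-endian 32-bit word of the block
  // to dst (even i -> low half, odd i -> high half of _regs[i/2]).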
4338 
4339   // a += value + x + ac;
4340   // a = Integer.rotateLeft(a, s) + b;
4341   void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
4342                                Register a, Register b, Register c, Register d,
4343                                int k, int s, int t,
4344                                Register value) {
4345     // a += ac
4346     __ addw(a, a, t, t1);
4347 
4348     // a += x;
4349     reg_cache.add_u32(a, k);
4350     // a += value;
4351     __ addw(a, a, value);
4352 
4353     // a = Integer.rotateLeft(a, s) + b;
4354     __ rolw_imm(a, a, s);
4355     __ addw(a, a, b);
4356   }
4357 
4358   // a += ((b & c) | ((~b) & d)) + x + ac;
4359   // a = Integer.rotateLeft(a, s) + b;
4360   void md5_FF(BufRegCache& reg_cache,
4361               Register a, Register b, Register c, Register d,
4362               int k, int s, int t,
4363               Register rtmp1, Register rtmp2) {
4364     // rtmp1 = b & c
4365     __ andr(rtmp1, b, c);
4366 
4367     // rtmp2 = (~b) & d
4368     __ andn(rtmp2, d, b);
4369 
4370     // rtmp1 = (b & c) | ((~b) & d)
4371     __ orr(rtmp1, rtmp1, rtmp2);
4372 
4373     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4374   }
4375 
4376   // a += ((b & d) | (c & (~d))) + x + ac;
4377   // a = Integer.rotateLeft(a, s) + b;
4378   void md5_GG(BufRegCache& reg_cache,
4379               Register a, Register b, Register c, Register d,
4380               int k, int s, int t,
4381               Register rtmp1, Register rtmp2) {
4382     // rtmp1 = b & d
4383     __ andr(rtmp1, b, d);
4384 
4385     // rtmp2 = c & (~d)
4386     __ andn(rtmp2, c, d);
4387 
4388     // rtmp1 = (b & d) | (c & (~d))
4389     __ orr(rtmp1, rtmp1, rtmp2);
4390 
4391     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4392   }
4393 
4394   // a += ((b ^ c) ^ d) + x + ac;
4395   // a = Integer.rotateLeft(a, s) + b;
4396   void md5_HH(BufRegCache& reg_cache,
4397               Register a, Register b, Register c, Register d,
4398               int k, int s, int t,
4399               Register rtmp1, Register rtmp2) {
4400     // rtmp1 = (b ^ c) ^ d
4401     __ xorr(rtmp2, b, c);
4402     __ xorr(rtmp1, rtmp2, d);
4403 
4404     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4405   }
4406 
4407   // a += (c ^ (b | (~d))) + x + ac;
4408   // a = Integer.rotateLeft(a, s) + b;
4409   void md5_II(BufRegCache& reg_cache,
4410               Register a, Register b, Register c, Register d,
4411               int k, int s, int t,
4412               Register rtmp1, Register rtmp2) {
4413     // rtmp1 = c ^ (b | (~d))
4414     __ orn(rtmp2, b, d);
4415     __ xorr(rtmp1, c, rtmp2);
4416 
4417     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4418   }
4419 
4420   // Arguments:
4421   //
4422   // Inputs:
4423   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   MD5.state
4425   //   c_rarg2   - int     offset  (multi_block == True)
4426   //   c_rarg3   - int     limit   (multi_block == True)
4427   //
4428   // Registers:
4429   //    x0   zero  (zero)
4430   //    x1     ra  (return address)
4431   //    x2     sp  (stack pointer)
4432   //    x3     gp  (global pointer)
4433   //    x4     tp  (thread pointer)
4434   //    x5     t0  (tmp register)
4435   //    x6     t1  (tmp register)
4436   //    x7     t2  state0
  //    x8  fp/s0  (frame pointer)
4438   //    x9     s1
4439   //   x10     a0  rtmp1 / c_rarg0
4440   //   x11     a1  rtmp2 / c_rarg1
4441   //   x12     a2  a     / c_rarg2
4442   //   x13     a3  b     / c_rarg3
4443   //   x14     a4  c
4444   //   x15     a5  d
4445   //   x16     a6  buf
4446   //   x17     a7  state
4447   //   x18     s2  ofs     [saved-reg]  (multi_block == True)
4448   //   x19     s3  limit   [saved-reg]  (multi_block == True)
4449   //   x20     s4  state1  [saved-reg]
4450   //   x21     s5  state2  [saved-reg]
4451   //   x22     s6  state3  [saved-reg]
4452   //   x23     s7
4453   //   x24     s8  buf0    [saved-reg]
4454   //   x25     s9  buf1    [saved-reg]
4455   //   x26    s10  buf2    [saved-reg]
4456   //   x27    s11  buf3    [saved-reg]
4457   //   x28     t3  buf4
4458   //   x29     t4  buf5
4459   //   x30     t5  buf6
4460   //   x31     t6  buf7
4461   address generate_md5_implCompress(bool multi_block, const char *name) {
4462     __ align(CodeEntryAlignment);
4463     StubCodeMark mark(this, "StubRoutines", name);
4464     address start = __ pc();
4465 
4466     // rotation constants
4467     const int S11 = 7;
4468     const int S12 = 12;
4469     const int S13 = 17;
4470     const int S14 = 22;
4471     const int S21 = 5;
4472     const int S22 = 9;
4473     const int S23 = 14;
4474     const int S24 = 20;
4475     const int S31 = 4;
4476     const int S32 = 11;
4477     const int S33 = 16;
4478     const int S34 = 23;
4479     const int S41 = 6;
4480     const int S42 = 10;
4481     const int S43 = 15;
4482     const int S44 = 21;
4483 
4484     const int64_t mask32 = 0xffffffff;
4485 
4486     Register buf_arg   = c_rarg0; // a0
4487     Register state_arg = c_rarg1; // a1
4488     Register ofs_arg   = c_rarg2; // a2
4489     Register limit_arg = c_rarg3; // a3
4490 
4491     // we'll copy the args to these registers to free up a0-a3
4492     // to use for other values manipulated by instructions
4493     // that can be compressed
4494     Register buf       = x16; // a6
4495     Register state     = x17; // a7
4496     Register ofs       = x18; // s2
4497     Register limit     = x19; // s3
4498 
4499     // using x12->15 to allow compressed instructions
4500     Register a         = x12; // a2
4501     Register b         = x13; // a3
4502     Register c         = x14; // a4
4503     Register d         = x15; // a5
4504 
4505     Register state0    =  x7; // t2
4506     Register state1    = x20; // s4
4507     Register state2    = x21; // s5
4508     Register state3    = x22; // s6
4509 
4510     // using x10->x11 to allow compressed instructions
4511     Register rtmp1     = x10; // a0
4512     Register rtmp2     = x11; // a1
4513 
4514     RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
4515     RegSet reg_cache_regs;
4516     reg_cache_regs += reg_cache_saved_regs;
4517     reg_cache_regs += RegSet::of(x28, x29, x30, x31); // t3, t4, t5, t6
4518     BufRegCache reg_cache(_masm, reg_cache_regs);
4519 
4520     RegSet saved_regs;
4521     if (multi_block) {
4522       saved_regs += RegSet::of(ofs, limit);
4523     }
4524     saved_regs += RegSet::of(state1, state2, state3);
4525     saved_regs += reg_cache_saved_regs;
4526 
4527     __ push_reg(saved_regs, sp);
4528 
4529     __ mv(buf, buf_arg);
4530     __ mv(state, state_arg);
4531     if (multi_block) {
4532       __ mv(ofs, ofs_arg);
4533       __ mv(limit, limit_arg);
4534     }
4535 
    // To minimize the number of memory operations:
    // read the 4 4-byte state values in pairs, each with a single ld,
    // and split them into 2 registers.
    //
    // Since the core MD5 algorithm works on 32-bit words, the code below
    // does not care about the contents of the upper 32 bits of state[x].
    // Based on this observation, we can apply a further optimization:
    // simply ignore the upper 32 bits of state0/state2, rather than zeroing
    // them explicitly with extra instructions.
4546     __ ld(state0, Address(state));
4547     __ srli(state1, state0, 32);
4548     __ ld(state2, Address(state, 8));
4549     __ srli(state3, state2, 32);
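    // After these loads (little-endian), the lower 32 bits hold:
    //   state0 = state[0] (A), state1 = state[1] (B),
    //   state2 = state[2] (C), state3 = state[3] (D);
    // the upper 32 bits of state0/state2 still contain B/D but are ignored, as noted above.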
4550 
4551     Label md5_loop;
4552     __ BIND(md5_loop);
4553 
4554     __ mv(a, state0);
4555     __ mv(b, state1);
4556     __ mv(c, state2);
4557     __ mv(d, state3);
4558 
4559     // Round 1
4560     reg_cache.gen_load(0, buf);
4561     md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
4562     md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
4563     reg_cache.gen_load(1, buf);
4564     md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
4565     md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
4566     reg_cache.gen_load(2, buf);
4567     md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
4568     md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
4569     reg_cache.gen_load(3, buf);
4570     md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
4571     md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
4572     reg_cache.gen_load(4, buf);
4573     md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
4574     md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
4575     reg_cache.gen_load(5, buf);
4576     md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
4577     md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
4578     reg_cache.gen_load(6, buf);
4579     md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
4580     md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
4581     reg_cache.gen_load(7, buf);
4582     md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
4583     md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);
4584 
4585     // Round 2
4586     md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
4587     md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
4588     md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
4589     md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
4590     md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
4591     md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
4592     md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
4593     md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
4594     md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
4595     md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
4596     md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
4597     md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
4598     md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
4599     md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
4600     md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
4601     md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);
4602 
4603     // Round 3
4604     md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
4605     md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
4606     md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
4607     md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
4608     md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
4609     md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
4610     md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
4611     md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
4612     md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
4613     md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
4614     md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
4615     md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
4616     md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
4617     md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
4618     md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
4619     md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);
4620 
4621     // Round 4
4622     md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
4623     md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
4624     md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
4625     md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
4626     md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
4627     md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
4628     md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
4629     md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
4630     md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
4631     md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
4632     md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
4633     md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
4634     md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
4635     md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
4636     md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
4637     md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);
4638 
4639     __ addw(state0, state0, a);
4640     __ addw(state1, state1, b);
4641     __ addw(state2, state2, c);
4642     __ addw(state3, state3, d);
4643 
4644     if (multi_block) {
4645       __ addi(buf, buf, 64);
4646       __ addi(ofs, ofs, 64);
      // if (ofs <= limit) goto md5_loop
4648       __ bge(limit, ofs, md5_loop);
4649       __ mv(c_rarg0, ofs); // return ofs
4650     }
4651 
    // To minimize the number of memory operations:
    // write back the 4 4-byte state values in pairs, each with a single sd.
4654     __ mv(t0, mask32);
4655     __ andr(state0, state0, t0);
4656     __ slli(state1, state1, 32);
4657     __ orr(state0, state0, state1);
4658     __ sd(state0, Address(state));
4659     __ andr(state2, state2, t0);
4660     __ slli(state3, state3, 32);
4661     __ orr(state2, state2, state3);
4662     __ sd(state2, Address(state, 8));
4663 
4664     __ pop_reg(saved_regs, sp);
4665     __ ret();
4666 
4667     return (address) start;
4668   }
4669 
4670   /**
4671    * Perform the quarter round calculations on values contained within four vector registers.
4672    *
4673    * @param aVec the SIMD register containing only the "a" values
4674    * @param bVec the SIMD register containing only the "b" values
4675    * @param cVec the SIMD register containing only the "c" values
4676    * @param dVec the SIMD register containing only the "d" values
   * @param tmp_vr temporary vector register that holds intermediate values.
4678    */
4679   void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
4680                           VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
4681     // a += b, d ^= a, d <<<= 16
4682     __ vadd_vv(aVec, aVec, bVec);
4683     __ vxor_vv(dVec, dVec, aVec);
4684     __ vrole32_vi(dVec, 16, tmp_vr);
4685 
4686     // c += d, b ^= c, b <<<= 12
4687     __ vadd_vv(cVec, cVec, dVec);
4688     __ vxor_vv(bVec, bVec, cVec);
4689     __ vrole32_vi(bVec, 12, tmp_vr);
4690 
4691     // a += b, d ^= a, d <<<= 8
4692     __ vadd_vv(aVec, aVec, bVec);
4693     __ vxor_vv(dVec, dVec, aVec);
4694     __ vrole32_vi(dVec, 8, tmp_vr);
4695 
4696     // c += d, b ^= c, b <<<= 7
4697     __ vadd_vv(cVec, cVec, dVec);
4698     __ vxor_vv(bVec, bVec, cVec);
4699     __ vrole32_vi(bVec, 7, tmp_vr);
4700   }
4701 
4702   /**
4703    * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
4704    *
4705    *  Input arguments:
4706    *  c_rarg0   - state, the starting state
4707    *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
4708    *
4709    *  Implementation Note:
4710    *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
   *   N depends on the vector register length.
4712    */
4713   address generate_chacha20Block() {
4714     Label L_Rounds;
4715 
4716     __ align(CodeEntryAlignment);
4717     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4718     address start = __ pc();
4719     __ enter();
4720 
4721     const int states_len = 16;
4722     const int step = 4;
4723     const Register state = c_rarg0;
4724     const Register key_stream = c_rarg1;
4725     const Register tmp_addr = t0;
4726     const Register length = t1;
4727 
4728     // Organize vector registers in an array that facilitates
4729     // putting repetitive opcodes into loop structures below.
4730     const VectorRegister work_vrs[16] = {
4731       v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
4732       v8, v9, v10, v11, v12, v13, v14, v15
4733     };
4734     const VectorRegister tmp_vr = v16;
4735     const VectorRegister counter_vr = v17;
4736 
4737     {
      // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
      // at the Java level.
4740       __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
4741     }
4742 
4743     // Load from source state.
4744     // Every element in source state is duplicated to all elements in the corresponding vector.
4745     __ mv(tmp_addr, state);
4746     for (int i = 0; i < states_len; i += 1) {
4747       __ vlse32_v(work_vrs[i], tmp_addr, zr);
4748       __ addi(tmp_addr, tmp_addr, step);
4749     }
4750     // Adjust counter for every individual block.
4751     __ vid_v(counter_vr);
4752     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
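    // Illustration: vid_v yields {0, 1, 2, ...}, so with an initial counter value c,
    // lane i of work_vrs[12] now holds c + i, i.e. one keystream block per lane.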
4753 
4754     // Perform 10 iterations of the 8 quarter round set
4755     {
4756       const Register loop = t2; // share t2 with other non-overlapping usages.
4757       __ mv(loop, 10);
4758       __ BIND(L_Rounds);
4759 
4760       chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
4761       chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
4762       chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
4763       chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
4764 
4765       chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
4766       chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
4767       chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
4768       chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
4769 
4770       __ sub(loop, loop, 1);
4771       __ bnez(loop, L_Rounds);
4772     }
4773 
    // Add the original state into the final working state.
    // We do this by first duplicating every element of the source state array into the
    // corresponding vector, then adding it to the post-loop working state.
4777     __ mv(tmp_addr, state);
4778     for (int i = 0; i < states_len; i += 1) {
4779       __ vlse32_v(tmp_vr, tmp_addr, zr);
4780       __ addi(tmp_addr, tmp_addr, step);
4781       __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
4782     }
4783     // Add the counter overlay onto work_vrs[12] at the end.
4784     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
4785 
4786     // Store result to key stream.
4787     {
4788       const Register stride = t2; // share t2 with other non-overlapping usages.
      // Every block occupies 64 bytes, so we use 64 as the stride of the vector store.
4790       __ mv(stride, 64);
4791       for (int i = 0; i < states_len; i += 1) {
4792         __ vsse32_v(work_vrs[i], key_stream, stride);
4793         __ addi(key_stream, key_stream, step);
4794       }
4795     }
4796 
4797     // Return length of output key_stream
4798     __ slli(c_rarg0, length, 6);
4799 
4800     __ leave();
4801     __ ret();
4802 
4803     return (address) start;
4804   }
4805 
4806 
4807   // ------------------------ SHA-1 intrinsic ------------------------
4808 
4809   // K't =
4810   //    5a827999, 0  <= t <= 19
4811   //    6ed9eba1, 20 <= t <= 39
4812   //    8f1bbcdc, 40 <= t <= 59
4813   //    ca62c1d6, 60 <= t <= 79
4814   void sha1_prepare_k(Register cur_k, int round) {
4815     assert(round >= 0 && round < 80, "must be");
4816 
4817     static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
4818     if ((round % 20) == 0) {
4819       __ mv(cur_k, ks[round/20]);
4820     }
4821   }
4822 
4823   // W't =
4824   //    M't,                                      0 <=  t <= 15
4825   //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4826   void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
4827     assert(round >= 0 && round < 80, "must be");
4828 
4829     if (round < 16) {
4830       // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
4831       //   in ws[0], high part contains W't-0, low part contains W't-1,
4832       //   in ws[1], high part contains W't-2, low part contains W't-3,
4833       //   ...
4834       //   in ws[7], high part contains W't-14, low part contains W't-15.
4835 
4836       if ((round % 2) == 0) {
4837         __ ld(ws[round/2], Address(buf, (round/2) * 8));
4838         // reverse bytes, as SHA-1 is defined in big-endian.
4839         __ revb(ws[round/2], ws[round/2]);
4840         __ srli(cur_w, ws[round/2], 32);
4841       } else {
4842         __ mv(cur_w, ws[round/2]);
4843       }
4844 
4845       return;
4846     }
4847 
4848     if ((round % 2) == 0) {
4849       int idx = 16;
4850       // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4851       __ srli(t1, ws[(idx-8)/2], 32);
4852       __ xorr(t0, ws[(idx-3)/2], t1);
4853 
4854       __ srli(t1, ws[(idx-14)/2], 32);
4855       __ srli(cur_w, ws[(idx-16)/2], 32);
4856       __ xorr(cur_w, cur_w, t1);
4857 
4858       __ xorr(cur_w, cur_w, t0);
4859       __ rolw_imm(cur_w, cur_w, 1, t0);
4860 
4861       // copy the cur_w value to ws[8].
4862       // now, valid w't values are at:
4863       //  w0:       ws[0]'s lower 32 bits
4864       //  w1 ~ w14: ws[1] ~ ws[7]
4865       //  w15:      ws[8]'s higher 32 bits
4866       __ slli(ws[idx/2], cur_w, 32);
4867 
4868       return;
4869     }
4870 
4871     int idx = 17;
4872     // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4873     __ srli(t1, ws[(idx-3)/2], 32);
4874     __ xorr(t0, t1, ws[(idx-8)/2]);
4875 
4876     __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
4877 
4878     __ xorr(cur_w, cur_w, t0);
4879     __ rolw_imm(cur_w, cur_w, 1, t0);
4880 
4881     // copy the cur_w value to ws[8]
4882     __ zero_extend(cur_w, cur_w, 32);
4883     __ orr(ws[idx/2], ws[idx/2], cur_w);
4884 
4885     // shift the w't registers, so they start from ws[0] again.
4886     // now, valid w't values are at:
4887     //  w0 ~ w15: ws[0] ~ ws[7]
4888     Register ws_0 = ws[0];
4889     for (int i = 0; i < 16/2; i++) {
4890       ws[i] = ws[i+1];
4891     }
4892     ws[8] = ws_0;
4893   }
4894 
4895   // f't(x, y, z) =
4896   //    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
4897   //    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
4898   //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
4899   //    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
4900   void sha1_f(Register dst, Register x, Register y, Register z, int round) {
4901     assert(round >= 0 && round < 80, "must be");
4902     assert_different_registers(dst, x, y, z, t0, t1);
4903 
4904     if (round < 20) {
4905       // (x & y) ^ (~x & z)
4906       __ andr(t0, x, y);
4907       __ andn(dst, z, x);
4908       __ xorr(dst, dst, t0);
4909     } else if (round >= 40 && round < 60) {
4910       // (x & y) ^ (x & z) ^ (y & z)
4911       __ andr(t0, x, y);
4912       __ andr(t1, x, z);
4913       __ andr(dst, y, z);
4914       __ xorr(dst, dst, t0);
4915       __ xorr(dst, dst, t1);
4916     } else {
4917       // x ^ y ^ z
4918       __ xorr(dst, x, y);
4919       __ xorr(dst, dst, z);
4920     }
4921   }
4922 
4923   // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
4924   // e = d
4925   // d = c
4926   // c = ROTL'30(b)
4927   // b = a
4928   // a = T
4929   void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
4930                           Register cur_k, Register cur_w, Register tmp, int round) {
4931     assert(round >= 0 && round < 80, "must be");
4932     assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
4933 
4934     // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
4935 
4936     // cur_w will be recalculated at the beginning of each round,
4937     // so, we can reuse it as a temp register here.
4938     Register tmp2 = cur_w;
4939 
4940     // reuse e as a temporary register, as we will mv new value into it later
4941     Register tmp3 = e;
4942     __ add(tmp2, cur_k, tmp2);
4943     __ add(tmp3, tmp3, tmp2);
4944     __ rolw_imm(tmp2, a, 5, t0);
4945 
4946     sha1_f(tmp, b, c, d, round);
4947 
4948     __ add(tmp2, tmp2, tmp);
4949     __ add(tmp2, tmp2, tmp3);
4950 
4951     // e = d
4952     // d = c
4953     // c = ROTL'30(b)
4954     // b = a
4955     // a = T
4956     __ mv(e, d);
4957     __ mv(d, c);
4958 
4959     __ rolw_imm(c, b, 30);
4960     __ mv(b, a);
4961     __ mv(a, tmp2);
4962   }
4963 
4964   // H(i)0 = a + H(i-1)0
4965   // H(i)1 = b + H(i-1)1
4966   // H(i)2 = c + H(i-1)2
4967   // H(i)3 = d + H(i-1)3
4968   // H(i)4 = e + H(i-1)4
4969   void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
4970                               Register prev_ab, Register prev_cd, Register prev_e) {
4971     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
4972 
4973     __ add(a, a, prev_ab);
4974     __ srli(prev_ab, prev_ab, 32);
4975     __ add(b, b, prev_ab);
4976 
4977     __ add(c, c, prev_cd);
4978     __ srli(prev_cd, prev_cd, 32);
4979     __ add(d, d, prev_cd);
4980 
4981     __ add(e, e, prev_e);
4982   }
4983 
4984   void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
4985                                 Register prev_ab, Register prev_cd, Register prev_e) {
4986     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
4987 
4988     __ slli(t0, b, 32);
4989     __ zero_extend(prev_ab, a, 32);
4990     __ orr(prev_ab, prev_ab, t0);
4991 
4992     __ slli(t0, d, 32);
4993     __ zero_extend(prev_cd, c, 32);
4994     __ orr(prev_cd, prev_cd, t0);
4995 
4996     __ mv(prev_e, e);
4997   }
4998 
4999   // Intrinsic for:
5000   //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5001   //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5002   //
5003   // Arguments:
5004   //
5005   // Inputs:
5006   //   c_rarg0: byte[]  src array + offset
5007   //   c_rarg1: int[]   SHA.state
5008   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5009   //   c_rarg2: int     offset
5010   //   c_rarg3: int     limit
5011   //
5012   // Outputs:
5013   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5014   //   c_rarg0: int offset, when (multi_block == true)
5015   //
5016   address generate_sha1_implCompress(bool multi_block, const char *name) {
5017     __ align(CodeEntryAlignment);
5018     StubCodeMark mark(this, "StubRoutines", name);
5019 
5020     address start = __ pc();
5021     __ enter();
5022 
5023     RegSet saved_regs = RegSet::range(x18, x27);
5024     if (multi_block) {
5025       // use x9 as src below.
5026       saved_regs += RegSet::of(x9);
5027     }
5028     __ push_reg(saved_regs, sp);
5029 
5030     // c_rarg0 - c_rarg3: x10 - x13
5031     Register buf    = c_rarg0;
5032     Register state  = c_rarg1;
5033     Register offset = c_rarg2;
5034     Register limit  = c_rarg3;
5035     // use src to contain the original start point of the array.
5036     Register src    = x9;
5037 
5038     if (multi_block) {
5039       __ sub(limit, limit, offset);
5040       __ add(limit, limit, buf);
5041       __ sub(src, buf, offset);
5042     }
5043 
5044     // [args-reg]:  x14 - x17
5045     // [temp-reg]:  x28 - x31
5046     // [saved-reg]: x18 - x27
5047 
5048     // h0/1/2/3/4
5049     const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5050     // w0, w1, ... w15
    // put two adjacent w's in one register:
    //    one in the high word part, the other in the low word part
    // at different rounds (even or odd), the w't values reside in different slots of ws[].
5054     // w0 ~ w15, either reside in
5055     //    ws[0] ~ ws[7], where
5056     //      w0 at higher 32 bits of ws[0],
5057     //      w1 at lower 32 bits of ws[0],
5058     //      ...
5059     //      w14 at higher 32 bits of ws[7],
5060     //      w15 at lower 32 bits of ws[7].
5061     // or, reside in
5062     //    w0:       ws[0]'s lower 32 bits
5063     //    w1 ~ w14: ws[1] ~ ws[7]
5064     //    w15:      ws[8]'s higher 32 bits
5065     Register ws[9] = {x29, x30, x31, x18,
5066                       x19, x20, x21, x22,
5067                       x23}; // auxiliary register for calculating w's value
5068     // current k't's value
5069     const Register cur_k = x24;
5070     // current w't's value
5071     const Register cur_w = x25;
5072     // values of a, b, c, d, e in the previous round
5073     const Register prev_ab = x26, prev_cd = x27;
5074     const Register prev_e = offset; // reuse offset/c_rarg2
5075 
5076     // load 5 words state into a, b, c, d, e.
5077     //
    // To minimize the number of memory operations, we apply the following
    // optimization: read the 4-byte state values (a/b/c/d) in pairs,
    // each with a single ld, and split them into 2 registers.
    //
    // Since the core SHA-1 algorithm works on 32-bit words, the code below
    // does not care about the contents of the upper 32 bits of a/b/c/d/e.
    // Based on this observation, we can apply a further optimization:
    // simply ignore the upper 32 bits of a/c/e, rather than zeroing them
    // explicitly with extra instructions.
5088     __ ld(a, Address(state, 0));
5089     __ srli(b, a, 32);
5090     __ ld(c, Address(state, 8));
5091     __ srli(d, c, 32);
5092     __ lw(e, Address(state, 16));
5093 
5094     Label L_sha1_loop;
5095     if (multi_block) {
5096       __ BIND(L_sha1_loop);
5097     }
5098 
5099     sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5100 
5101     for (int round = 0; round < 80; round++) {
5102       // prepare K't value
5103       sha1_prepare_k(cur_k, round);
5104 
5105       // prepare W't value
5106       sha1_prepare_w(cur_w, ws, buf, round);
5107 
5108       // one round process
5109       sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
5110     }
5111 
5112     // compute the intermediate hash value
5113     sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5114 
5115     if (multi_block) {
5116       int64_t block_bytes = 16 * 4;
5117       __ addi(buf, buf, block_bytes);
5118 
5119       __ bge(limit, buf, L_sha1_loop, true);
5120     }
5121 
5122     // store back the state.
5123     __ zero_extend(a, a, 32);
5124     __ slli(b, b, 32);
5125     __ orr(a, a, b);
5126     __ sd(a, Address(state, 0));
5127     __ zero_extend(c, c, 32);
5128     __ slli(d, d, 32);
5129     __ orr(c, c, d);
5130     __ sd(c, Address(state, 8));
5131     __ sw(e, Address(state, 16));
5132 
5133     // return offset
5134     if (multi_block) {
5135       __ sub(c_rarg0, buf, src);
5136     }
5137 
5138     __ pop_reg(saved_regs, sp);
5139 
5140     __ leave();
5141     __ ret();
5142 
5143     return (address) start;
5144   }
5145 
5146   /**
5147    * vector registers:
   *   input VectorRegister's:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
5149    *   index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
5150    *   output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
5151    *
5152    * NOTE: each field will occupy a vector register group
5153    */
5154   void base64_vector_encode_round(Register src, Register dst, Register codec,
5155                     Register size, Register stepSrc, Register stepDst,
5156                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
5157                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5158                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
5159                     Assembler::LMUL lmul) {
5160     // set vector register type/len
5161     __ vsetvli(x0, size, Assembler::e8, lmul);
5162 
5163     // segmented load src into v registers: mem(src) => vr(3)
5164     __ vlseg3e8_v(inputV1, src);
5165 
5166     // src = src + register_group_len_bytes * 3
5167     __ add(src, src, stepSrc);
5168 
5169     // encoding
5170     //   1. compute index into lookup table: vr(3) => vr(4)
5171     __ vsrl_vi(idxV1, inputV1, 2);
5172 
5173     __ vsrl_vi(idxV2, inputV2, 2);
5174     __ vsll_vi(inputV1, inputV1, 6);
5175     __ vor_vv(idxV2, idxV2, inputV1);
5176     __ vsrl_vi(idxV2, idxV2, 2);
5177 
5178     __ vsrl_vi(idxV3, inputV3, 4);
5179     __ vsll_vi(inputV2, inputV2, 4);
5180     __ vor_vv(idxV3, inputV2, idxV3);
5181     __ vsrl_vi(idxV3, idxV3, 2);
5182 
5183     __ vsll_vi(idxV4, inputV3, 2);
5184     __ vsrl_vi(idxV4, idxV4, 2);
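
    //   Illustrative summary of the index computation above (per 8-bit lane),
    //   matching the scalar bit layout used later in this stub:
    //     idxV1 = byte0 >> 2                         = byte0[7:2]
    //     idxV2 = ((byte0 << 6) | (byte1 >> 2)) >> 2 = byte0[1:0]:byte1[7:4]
    //     idxV3 = ((byte1 << 4) | (byte2 >> 4)) >> 2 = byte1[3:0]:byte2[7:6]
    //     idxV4 = (byte2 << 2) >> 2                  = byte2[5:0]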
5185 
5186     //   2. indexed load: vr(4) => vr(4)
5187     __ vluxei8_v(outputV1, codec, idxV1);
5188     __ vluxei8_v(outputV2, codec, idxV2);
5189     __ vluxei8_v(outputV3, codec, idxV3);
5190     __ vluxei8_v(outputV4, codec, idxV4);
5191 
5192     // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
5193     __ vsseg4e8_v(outputV1, dst);
5194 
5195     // dst = dst + register_group_len_bytes * 4
5196     __ add(dst, dst, stepDst);
5197   }
5198 
5199   /**
5200    *  void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
5201    *
5202    *  Input arguments:
5203    *  c_rarg0   - src, source array
5204    *  c_rarg1   - sp, src start offset
5205    *  c_rarg2   - sl, src end offset
5206    *  c_rarg3   - dst, dest array
5207    *  c_rarg4   - dp, dst start offset
5208    *  c_rarg5   - isURL, Base64 or URL character set
5209    */
5210   address generate_base64_encodeBlock() {
5211     alignas(64) static const char toBase64[64] = {
5212       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5213       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5214       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5215       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5216       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5217     };
5218 
5219     alignas(64) static const char toBase64URL[64] = {
5220       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5221       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5222       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5223       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5224       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5225     };
5226 
5227     __ align(CodeEntryAlignment);
5228     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5229     address start = __ pc();
5230     __ enter();
5231 
5232     Register src    = c_rarg0;
5233     Register soff   = c_rarg1;
5234     Register send   = c_rarg2;
5235     Register dst    = c_rarg3;
5236     Register doff   = c_rarg4;
5237     Register isURL  = c_rarg5;
5238 
5239     Register codec  = c_rarg6;
5240     Register length = c_rarg7; // total length of src data in bytes
5241 
5242     Label ProcessData, Exit;
5243 
5244     // length should be multiple of 3
5245     __ sub(length, send, soff);
5246     // real src/dst to process data
5247     __ add(src, src, soff);
5248     __ add(dst, dst, doff);
5249 
5250     // load the codec base address
5251     __ la(codec, ExternalAddress((address) toBase64));
5252     __ beqz(isURL, ProcessData);
5253     __ la(codec, ExternalAddress((address) toBase64URL));
5254     __ BIND(ProcessData);
5255 
5256     // vector version
5257     if (UseRVV) {
5258       Label ProcessM2, ProcessM1, ProcessScalar;
5259 
5260       Register size      = soff;
5261       Register stepSrcM1 = send;
5262       Register stepSrcM2 = doff;
5263       Register stepDst   = isURL;
5264 
5265       __ mv(size, MaxVectorSize * 2);
5266       __ mv(stepSrcM1, MaxVectorSize * 3);
5267       __ slli(stepSrcM2, stepSrcM1, 1);
5268       __ mv(stepDst, MaxVectorSize * 2 * 4);
5269 
5270       __ blt(length, stepSrcM2, ProcessM1);
5271 
5272       __ BIND(ProcessM2);
5273       base64_vector_encode_round(src, dst, codec,
5274                     size, stepSrcM2, stepDst,
5275                     v2, v4, v6,         // inputs
5276                     v8, v10, v12, v14,  // indexes
5277                     v16, v18, v20, v22, // outputs
5278                     Assembler::m2);
5279 
5280       __ sub(length, length, stepSrcM2);
5281       __ bge(length, stepSrcM2, ProcessM2);
5282 
5283       __ BIND(ProcessM1);
5284       __ blt(length, stepSrcM1, ProcessScalar);
5285 
5286       __ srli(size, size, 1);
5287       __ srli(stepDst, stepDst, 1);
5288       base64_vector_encode_round(src, dst, codec,
5289                     size, stepSrcM1, stepDst,
5290                     v1, v2, v3,         // inputs
5291                     v4, v5, v6, v7,     // indexes
5292                     v8, v9, v10, v11,   // outputs
5293                     Assembler::m1);
5294       __ sub(length, length, stepSrcM1);
5295 
5296       __ BIND(ProcessScalar);
5297     }
5298 
5299     // scalar version
5300     {
5301       Register byte1 = soff, byte0 = send, byte2 = doff;
5302       Register combined24Bits = isURL;
5303 
5304       __ beqz(length, Exit);
5305 
5306       Label ScalarLoop;
5307       __ BIND(ScalarLoop);
5308       {
5309         // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
5310         // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
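        //
        // Worked example (illustrative): input bytes "Man" = 0x4D 0x61 0x6E
        //   -> 6-bit groups 010011, 010110, 000101, 101110 = 19, 22, 5, 46
        //   -> codec lookups 'T', 'W', 'F', 'u', i.e. output "TWFu"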
5311 
5312         // load 3 bytes src data
5313         __ lbu(byte0, Address(src, 0));
5314         __ lbu(byte1, Address(src, 1));
5315         __ lbu(byte2, Address(src, 2));
5316         __ addi(src, src, 3);
5317 
5318         // construct 24 bits from 3 bytes
5319         __ slliw(byte0, byte0, 16);
5320         __ slliw(byte1, byte1, 8);
5321         __ orr(combined24Bits, byte0, byte1);
5322         __ orr(combined24Bits, combined24Bits, byte2);
5323 
        // get codec index and encode (i.e. load from codec by index)
5325         __ slliw(byte0, combined24Bits, 8);
5326         __ srliw(byte0, byte0, 26);
5327         __ add(byte0, codec, byte0);
5328         __ lbu(byte0, byte0);
5329 
5330         __ slliw(byte1, combined24Bits, 14);
5331         __ srliw(byte1, byte1, 26);
5332         __ add(byte1, codec, byte1);
5333         __ lbu(byte1, byte1);
5334 
5335         __ slliw(byte2, combined24Bits, 20);
5336         __ srliw(byte2, byte2, 26);
5337         __ add(byte2, codec, byte2);
5338         __ lbu(byte2, byte2);
5339 
5340         __ andi(combined24Bits, combined24Bits, 0x3f);
5341         __ add(combined24Bits, codec, combined24Bits);
5342         __ lbu(combined24Bits, combined24Bits);
5343 
5344         // store 4 bytes encoded data
5345         __ sb(byte0, Address(dst, 0));
5346         __ sb(byte1, Address(dst, 1));
5347         __ sb(byte2, Address(dst, 2));
5348         __ sb(combined24Bits, Address(dst, 3));
5349 
5350         __ sub(length, length, 3);
5351         __ addi(dst, dst, 4);
5352         // loop back
5353         __ bnez(length, ScalarLoop);
5354       }
5355     }
5356 
5357     __ BIND(Exit);
5358 
5359     __ leave();
5360     __ ret();
5361 
5362     return (address) start;
5363   }
5364 
5365   /**
5366    * vector registers:
   * input VectorRegister's:  inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
   * index VectorRegister's:  idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
   * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
5370    *
5371    * NOTE: each field will occupy a single vector register group
5372    */
5373   void base64_vector_decode_round(Register src, Register dst, Register codec,
5374                     Register size, Register stepSrc, Register stepDst, Register failedIdx,
5375                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
5376                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5377                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
5378                     Assembler::LMUL lmul) {
5379     // set vector register type/len
5380     __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
5381 
5382     // segmented load src into v registers: mem(src) => vr(4)
5383     __ vlseg4e8_v(inputV1, src);
5384 
5385     // src = src + register_group_len_bytes * 4
5386     __ add(src, src, stepSrc);
5387 
5388     // decoding
5389     //   1. indexed load: vr(4) => vr(4)
5390     __ vluxei8_v(idxV1, codec, inputV1);
5391     __ vluxei8_v(idxV2, codec, inputV2);
5392     __ vluxei8_v(idxV3, codec, inputV3);
5393     __ vluxei8_v(idxV4, codec, inputV4);
5394 
5395     //   2. check wrong data
5396     __ vor_vv(outputV1, idxV1, idxV2);
5397     __ vor_vv(outputV2, idxV3, idxV4);
5398     __ vor_vv(outputV1, outputV1, outputV2);
5399     __ vmseq_vi(v0, outputV1, -1);
5400     __ vfirst_m(failedIdx, v0);
5401     Label NoFailure, FailureAtIdx0;
    // a negative failedIdx can only be -1 (vfirst_m found no set mask bit), i.e. no invalid byte
5403     __ bltz(failedIdx, NoFailure);
5404     // when the first data (at index 0) fails, no need to process data anymore
5405     __ beqz(failedIdx, FailureAtIdx0);
5406     __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
5407     __ slli(stepDst, failedIdx, 1);
5408     __ add(stepDst, failedIdx, stepDst);
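    // stepDst = failedIdx * 3: only the bytes decoded before the first
    // invalid input character will be stored to dst below.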
5409     __ BIND(NoFailure);
5410 
5411     //   3. compute the decoded data: vr(4) => vr(3)
5412     __ vsll_vi(idxV1, idxV1, 2);
5413     __ vsrl_vi(outputV1, idxV2, 4);
5414     __ vor_vv(outputV1, outputV1, idxV1);
5415 
5416     __ vsll_vi(idxV2, idxV2, 4);
5417     __ vsrl_vi(outputV2, idxV3, 2);
5418     __ vor_vv(outputV2, outputV2, idxV2);
5419 
5420     __ vsll_vi(idxV3, idxV3, 6);
5421     __ vor_vv(outputV3, idxV4, idxV3);
5422 
5423     // segmented store encoded data in v registers back to dst: vr(3) => mem(dst)
5424     __ vsseg3e8_v(outputV1, dst);
5425 
5426     // dst = dst + register_group_len_bytes * 3
5427     __ add(dst, dst, stepDst);
5428     __ BIND(FailureAtIdx0);
5429   }
5430 
5431   /**
5432    * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
5433    *
5434    *  Input arguments:
5435    *  c_rarg0   - src, source array
5436    *  c_rarg1   - sp, src start offset
5437    *  c_rarg2   - sl, src end offset
5438    *  c_rarg3   - dst, dest array
5439    *  c_rarg4   - dp, dst start offset
5440    *  c_rarg5   - isURL, Base64 or URL character set
5441    *  c_rarg6   - isMIME, Decoding MIME block
5442    */
5443   address generate_base64_decodeBlock() {
5444 
5445     static const uint8_t fromBase64[256] = {
5446         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5447         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5448         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5449         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5450         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5451         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
5452         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5453         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5454         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5455         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5456         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5457         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5458         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5459         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5460         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5461         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5462     };
5463 
5464     static const uint8_t fromBase64URL[256] = {
5465         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5466         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5467         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5468         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5469         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5470         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
5471         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5472         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5473         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5474         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5475         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5476         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5477         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5478         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5479         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5480         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5481     };
5482 
5483     __ align(CodeEntryAlignment);
5484     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
5485     address start = __ pc();
5486     __ enter();
5487 
5488     Register src    = c_rarg0;
5489     Register soff   = c_rarg1;
5490     Register send   = c_rarg2;
5491     Register dst    = c_rarg3;
5492     Register doff   = c_rarg4;
5493     Register isURL  = c_rarg5;
5494     Register isMIME = c_rarg6;
5495 
5496     Register codec     = c_rarg7;
5497     Register dstBackup = x31;
5498     Register length    = x28;     // t3, total length of src data in bytes
5499 
5500     Label ProcessData, Exit;
5501     Label ProcessScalar, ScalarLoop;
5502 
    // The length passed in (send - soff) is guaranteed to be > 4.
    // In this intrinsic we only process data whose length is a multiple of 4;
    // that is not guaranteed by the Java level, so round the length down explicitly here.
5506     __ sub(length, send, soff);
5507     __ andi(length, length, -4);
5508     // real src/dst to process data
5509     __ add(src, src, soff);
5510     __ add(dst, dst, doff);
5511     // backup of dst, used to calculate the return value at exit
5512     __ mv(dstBackup, dst);
5513 
5514     // load the codec base address
5515     __ la(codec, ExternalAddress((address) fromBase64));
5516     __ beqz(isURL, ProcessData);
5517     __ la(codec, ExternalAddress((address) fromBase64URL));
5518     __ BIND(ProcessData);
5519 
5520     // vector version
5521     if (UseRVV) {
      // In the MIME case there is a default line-length limit of 76, which can be
      // different from (smaller than) (send - soff), so for MIME we go through
      // the scalar code path directly.
5525       __ bnez(isMIME, ScalarLoop);
5526 
5527       Label ProcessM1, ProcessM2;
5528 
5529       Register failedIdx = soff;
5530       Register stepSrcM1 = send;
5531       Register stepSrcM2 = doff;
5532       Register stepDst   = isURL;
5533       Register size      = x29;   // t4
5534 
5535       __ mv(size, MaxVectorSize * 2);
5536       __ mv(stepSrcM1, MaxVectorSize * 4);
5537       __ slli(stepSrcM2, stepSrcM1, 1);
5538       __ mv(stepDst, MaxVectorSize * 2 * 3);
5539 
5540       __ blt(length, stepSrcM2, ProcessM1);
5541 
5542 
5543       // Assembler::m2
5544       __ BIND(ProcessM2);
5545       base64_vector_decode_round(src, dst, codec,
5546                     size, stepSrcM2, stepDst, failedIdx,
5547                     v2, v4, v6, v8,      // inputs
5548                     v10, v12, v14, v16,  // indexes
5549                     v18, v20, v22,       // outputs
5550                     Assembler::m2);
5551       __ sub(length, length, stepSrcM2);
5552 
      // error check
      // failedIdx >= 0 means an invalid byte was found; a negative value can only be -1 (no failure)
5555       __ bgez(failedIdx, Exit);
5556 
5557       __ bge(length, stepSrcM2, ProcessM2);
5558 
5559 
5560       // Assembler::m1
5561       __ BIND(ProcessM1);
5562       __ blt(length, stepSrcM1, ProcessScalar);
5563 
5564       __ srli(size, size, 1);
5565       __ srli(stepDst, stepDst, 1);
5566       base64_vector_decode_round(src, dst, codec,
5567                     size, stepSrcM1, stepDst, failedIdx,
5568                     v1, v2, v3, v4,      // inputs
5569                     v5, v6, v7, v8,      // indexes
5570                     v9, v10, v11,        // outputs
5571                     Assembler::m1);
5572       __ sub(length, length, stepSrcM1);
5573 
      // error check
      // failedIdx >= 0 means an invalid byte was found; a negative value can only be -1 (no failure)
5576       __ bgez(failedIdx, Exit);
5577 
5578       __ BIND(ProcessScalar);
5579       __ beqz(length, Exit);
5580     }
5581 
5582     // scalar version
5583     {
5584       Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
5585       Register combined32Bits = x29; // t5
5586 
5587       // encoded:   [byte0[5:0] : byte1[5:0] : byte2[5:0]] : byte3[5:0]] =>
5588       // plain:     [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
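      //
      // Worked example (illustrative): input "TWFu" maps through the codec table to
      //   19, 22, 5, 46 -> 24 bits 010011 010110 000101 101110
      //   -> decoded bytes 0x4D 0x61 0x6E, i.e. "Man"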
5589       __ BIND(ScalarLoop);
5590 
5591       // load 4 bytes encoded src data
5592       __ lbu(byte0, Address(src, 0));
5593       __ lbu(byte1, Address(src, 1));
5594       __ lbu(byte2, Address(src, 2));
5595       __ lbu(byte3, Address(src, 3));
5596       __ addi(src, src, 4);
5597 
      // get codec index and decode (i.e. load from codec by index)
5599       __ add(byte0, codec, byte0);
5600       __ add(byte1, codec, byte1);
5601       __ lb(byte0, Address(byte0, 0));
5602       __ lb(byte1, Address(byte1, 0));
5603       __ add(byte2, codec, byte2);
5604       __ add(byte3, codec, byte3);
5605       __ lb(byte2, Address(byte2, 0));
5606       __ lb(byte3, Address(byte3, 0));
5607       __ slliw(byte0, byte0, 18);
5608       __ slliw(byte1, byte1, 12);
5609       __ orr(byte0, byte0, byte1);
5610       __ orr(byte0, byte0, byte3);
5611       __ slliw(byte2, byte2, 6);
      // For performance, `combined32Bits` is constructed to serve 2 purposes at the same time:
5613       //  1. error check below
5614       //  2. decode below
5615       __ orr(combined32Bits, byte0, byte2);
5616 
5617       // error check
5618       __ bltz(combined32Bits, Exit);
5619 
5620       // store 3 bytes decoded data
5621       __ sraiw(byte0, combined32Bits, 16);
5622       __ sraiw(byte1, combined32Bits, 8);
5623       __ sb(byte0, Address(dst, 0));
5624       __ sb(byte1, Address(dst, 1));
5625       __ sb(combined32Bits, Address(dst, 2));
5626 
5627       __ sub(length, length, 4);
5628       __ addi(dst, dst, 3);
5629       // loop back
5630       __ bnez(length, ScalarLoop);
5631     }
5632 
5633     __ BIND(Exit);
5634     __ sub(c_rarg0, dst, dstBackup);
5635 
5636     __ leave();
5637     __ ret();
5638 
5639     return (address) start;
5640   }
5641 
5642   void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
5643     VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
5644     Register temp0, Register temp1, Register temp2,  Register temp3,
5645     VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
5646 
5647     assert((lmul == Assembler::m4 && step == 64) ||
5648            (lmul == Assembler::m2 && step == 32) ||
5649            (lmul == Assembler::m1 && step == 16),
5650            "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
    // Below is the code for calculating the Adler32 checksum with a 64-, 32- or 16-byte step. LMUL = m4, m2 or m1 is used accordingly.
5652     // The results are in v12, v13, ..., v22, v23. Example below is for 64-byte step case.
5653     // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
5654     // In non-vectorized code, we update s1 and s2 as:
5655     //   s1 <- s1 + b1
5656     //   s2 <- s2 + s1
5657     //   s1 <- s1 + b2
5658     //   s2 <- s2 + s1
5659     //   ...
5660     //   s1 <- s1 + b64
5661     //   s2 <- s2 + s1
5662     // Putting above assignments together, we have:
5663     //   s1_new = s1 + b1 + b2 + ... + b64
5664     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
5665     //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
5666     //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
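         // A tiny worked example of the same identity (hypothetical 4-byte step): for bytes
         // {1, 2, 3, 4}, the sequential updates give s1_new = s1 + 10 and
         // s2_new = s2 + 4 * s1 + (4*1 + 3*2 + 2*3 + 1*4) = s2 + 4 * s1 + 20,
         // i.e. s2 + s1 * step + (bytes dot (step, step-1, ..., 1)).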
5667 
5668     __ mv(temp3, step);
5669     // Load data
5670     __ vsetvli(temp0, temp3, Assembler::e8, lmul);
5671     __ vle8_v(vbytes, buff);
5672     __ addi(buff, buff, step);
5673 
5674     // Upper bound reduction sum for s1_new:
5675     // 0xFF * 64 = 0x3FC0, so:
5676     // 1. Need to do vector-widening reduction sum
5677     // 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements
5678     __ vwredsumu_vs(vs1acc, vbytes, vzero);
5679     // Multiplication for s2_new
5680     __ vwmulu_vv(vs2acc, vtable, vbytes);
5681 
5682     // s2 = s2 + s1 * step (s1 shifted left by log2(step))
5683     __ slli(temp1, s1, exact_log2(step));
5684     __ add(s2, s2, temp1);
5685 
5686     // Summing up calculated results for s2_new
5687     if (MaxVectorSize > 16) {
5688       __ vsetvli(temp0, temp3, Assembler::e16, lmul);
5689     } else {
5690       // For vlen == 16, half of the vector-widening multiplication result lands in
5691       // the successor of the vs2acc group, so we need to double the vector register
5692       // group width in order to reduction-sum all of it
5693       Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
5694                                (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
5695       __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
5696     }
5697     // Upper bound for reduction sum:
5698     // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
5699     // 1. Need to do vector-widening reduction sum
5700     // 2. It is safe to perform sign-extension during vmv.x.s with 32-bits elements
5701     __ vwredsumu_vs(vtemp1, vs2acc, vzero);
5702 
5703     // Extracting results for:
5704     // s1_new
5705     __ vmv_x_s(temp0, vs1acc);
5706     __ add(s1, s1, temp0);
5707     // s2_new
5708     __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
5709     __ vmv_x_s(temp1, vtemp1);
5710     __ add(s2, s2, temp1);
5711   }
5712 
5713   /***
5714    *  int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
5715    *
5716    *  Arguments:
5717    *
5718    *  Inputs:
5719    *   c_rarg0   - int   adler
5720    *   c_rarg1   - byte* buff (b + off)
5721    *   c_rarg2   - int   len
5722    *
5723    *  Output:
5724    *   c_rarg0   - int adler result
5725    */
5726   address generate_updateBytesAdler32() {
5727     __ align(CodeEntryAlignment);
5728     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
5729     address start = __ pc();
5730 
5731     Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
5732       L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
5733 
5734     // Aliases
5735     Register adler  = c_rarg0;
5736     Register s1     = c_rarg0;
5737     Register s2     = c_rarg3;
5738     Register buff   = c_rarg1;
5739     Register len    = c_rarg2;
5740     Register nmax  = c_rarg4;
5741     Register base  = c_rarg5;
5742     Register count = c_rarg6;
5743     Register temp0 = x28; // t3
5744     Register temp1 = x29; // t4
5745     Register temp2 = x30; // t5
5746     Register temp3 = x31; // t6
5747 
5748     VectorRegister vzero = v31;
5749     VectorRegister vbytes = v8; // group: v8, v9, v10, v11
5750     VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
5751     VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
5752     VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
5753     VectorRegister vtable_32 = v4; // group: v4, v5
5754     VectorRegister vtable_16 = v30;
5755     VectorRegister vtemp1 = v28;
5756     VectorRegister vtemp2 = v29;
5757 
5758     // Max number of bytes we can process before having to take the mod
5759     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
5760     const uint64_t BASE = 0xfff1;
5761     const uint64_t NMAX = 0x15B0;
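         // Sanity check for NMAX: 255 * 5552 * 5553 / 2 + 5553 * 0xfff0 = 4,294,690,200 <= 2^32 - 1,
         // while n = 5553 would already give 4,296,171,735 and overflow a 32-bit accumulator.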
5762 
5763     // Loop steps
5764     int step_64 = 64;
5765     int step_32 = 32;
5766     int step_16 = 16;
5767     int step_1  = 1;
5768 
5769     __ enter(); // Required for proper stackwalking of RuntimeStub frame
5770     __ mv(temp1, 64);
5771     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
5772 
5773     // Generating accumulation coefficients for further calculations
5774     // vtable_64:
5775     __ vid_v(vtemp1);
5776     __ vrsub_vx(vtable_64, vtemp1, temp1);
5777     // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
5778 
5779     // vtable_32:
5780     __ mv(temp1, 32);
5781     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
5782     __ vid_v(vtemp1);
5783     __ vrsub_vx(vtable_32, vtemp1, temp1);
5784     // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
5785 
5786     __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
5787     // vtable_16:
5788     __ mv(temp1, 16);
5789     __ vid_v(vtemp1);
5790     __ vrsub_vx(vtable_16, vtemp1, temp1);
5791     // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
5792 
5793     __ vmv_v_i(vzero, 0);
5794 
5795     __ mv(base, BASE);
5796     __ mv(nmax, NMAX);
5797 
5798     // s1 is initialized to the lower 16 bits of adler
5799     // s2 is initialized to the upper 16 bits of adler
5800     __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
5801     __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff)
5802 
5803     // The vectorized loop needs at least 16 bytes per iteration.
5804     // It would detect shorter inputs itself, but it is cheaper to skip straight to the byte cleanup loop.
5805     __ mv(temp0, step_16);
5806     __ bgeu(len, temp0, L_nmax);
5807     __ beqz(len, L_combine);
5808 
5809     // Jumping to L_by1_loop
5810     __ sub(len, len, step_1);
5811     __ j(L_by1_loop);
5812 
5813   __ bind(L_nmax);
5814     __ sub(len, len, nmax);
5815     __ sub(count, nmax, 16);
5816     __ bltz(len, L_by16);
5817 
5818   // Align L_nmax loop by 64
5819   __ bind(L_nmax_loop_entry);
5820     __ sub(count, count, 32);
5821 
5822   __ bind(L_nmax_loop);
5823     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
5824       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5825       vtemp1, vtemp2, step_64, Assembler::m4);
5826     __ sub(count, count, step_64);
5827     __ bgtz(count, L_nmax_loop);
5828 
5829     // 48 bytes of this NMAX chunk remain; process them as one 32-byte and one 16-byte step
5830     adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
5831       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5832       vtemp1, vtemp2, step_32, Assembler::m2);
5833     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
5834       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5835       vtemp1, vtemp2, step_16, Assembler::m1);
5836 
5837     // s1 = s1 % BASE
5838     __ remuw(s1, s1, base);
5839     // s2 = s2 % BASE
5840     __ remuw(s2, s2, base);
5841 
5842     __ sub(len, len, nmax);
5843     __ sub(count, nmax, 16);
5844     __ bgez(len, L_nmax_loop_entry);
5845 
5846   __ bind(L_by16);
5847     __ add(len, len, count);
5848     __ bltz(len, L_by1);
5849     // Trying to unroll
5850     __ mv(temp3, step_64);
5851     __ blt(len, temp3, L_by16_loop);
5852 
5853   __ bind(L_by16_loop_unroll);
5854     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
5855       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5856       vtemp1, vtemp2, step_64, Assembler::m4);
5857     __ sub(len, len, step_64);
5858     // temp3 still holds step_64 (64) here
5859     __ bge(len, temp3, L_by16_loop_unroll);
5860 
5861   __ bind(L_by16_loop);
5862     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
5863       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5864       vtemp1, vtemp2, step_16, Assembler::m1);
5865     __ sub(len, len, step_16);
5866     __ bgez(len, L_by16_loop);
5867 
5868   __ bind(L_by1);
5869     __ add(len, len, 15);
5870     __ bltz(len, L_do_mod);
5871 
5872   __ bind(L_by1_loop);
5873     __ lbu(temp0, Address(buff, 0));
5874     __ addi(buff, buff, step_1);
5875     __ add(s1, temp0, s1);
5876     __ add(s2, s2, s1);
5877     __ sub(len, len, step_1);
5878     __ bgez(len, L_by1_loop);
5879 
5880   __ bind(L_do_mod);
5881     // s1 = s1 % BASE
5882     __ remuw(s1, s1, base);
5883     // s2 = s2 % BASE
5884     __ remuw(s2, s2, base);
5885 
5886     // Combine lower bits and higher bits
5887     // adler = s1 | (s2 << 16)
5888   __ bind(L_combine);
5889     __ slli(s2, s2, 16);
5890     __ orr(s1, s1, s2);
5891 
5892     __ leave(); // Required for proper stackwalking of RuntimeStub frame
5893     __ ret();
5894 
5895     return start;
5896   }
5897 
5898 #endif // COMPILER2_OR_JVMCI
5899 
5900 #ifdef COMPILER2
5901 
5902   static const int64_t right_2_bits = right_n_bits(2);
5903   static const int64_t right_3_bits = right_n_bits(3);
5904 
5905   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
5906   // are represented as long[5], with BITS_PER_LIMB = 26.
5907   // Pack five 26-bit limbs into three 64-bit registers.
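       // Limb i occupies bits [26*i, 26*i + 25] of the 130-bit value, so:
       //   dest0 holds bits 0..63   (limb0, limb1 and the low 12 bits of limb2),
       //   dest1 holds bits 64..127 (the high 14 bits of limb2, limb3 and the low 24 bits of limb4),
       //   dest2 holds bits 128..129 (the high 2 bits of limb4).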
5908   void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
5909     assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);
5910 
5911     // The goal is to have 128-bit value in dest2:dest1:dest0
5912     __ ld(dest0, Address(src, 0));    // 26 bits in dest0
5913 
5914     __ ld(tmp1, Address(src, sizeof(jlong)));
5915     __ slli(tmp1, tmp1, 26);
5916     __ add(dest0, dest0, tmp1);       // 52 bits in dest0
5917 
5918     __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
5919     __ slli(tmp1, tmp2, 52);
5920     __ add(dest0, dest0, tmp1);       // dest0 is full
5921 
5922       __ srli(dest1, tmp2, 12);         // 14 bits in dest1
5923 
5924     __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
5925     __ slli(tmp1, tmp1, 14);
5926       __ add(dest1, dest1, tmp1);       // 40 bits in dest1
5927 
5928     __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
5929     __ slli(tmp2, tmp1, 40);
5930     __ add(dest1, dest1, tmp2);       // dest1 is full
5931 
5932     if (dest2->is_valid()) {
5933       __ srli(tmp1, tmp1, 24);
5934       __ mv(dest2, tmp1);               // 2 bits in dest2
5935     } else {
5936 #ifdef ASSERT
5937       Label OK;
5938       __ srli(tmp1, tmp1, 24);
5939       __ beq(zr, tmp1, OK);           // the top 2 bits must be zero
5940       __ stop("high bits of Poly1305 integer should be zero");
5941       __ should_not_reach_here();
5942       __ bind(OK);
5943 #endif
5944     }
5945   }
5946 
5947   // As above, but return only a 128-bit integer, packed into two
5948   // 64-bit registers.
5949   void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
5950     poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
5951   }
5952 
5953   // U_2:U_1:U_0 += (U_2 >> 2) * 5
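       // Rationale: 2^130 is congruent to 5 (mod 2^130 - 5), so the bits at and above 2^130
       // (i.e. U_2 >> 2) fold back into the sum multiplied by 5. 5 * x is computed as
       // x + (x << 2), which explains the two rounds of carried additions below.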
5954   void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
5955     assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);
5956 
5957     // First, U_2:U_1:U_0 += (U_2 >> 2)
5958     __ srli(tmp1, U_2, 2);
5959     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
5960     __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
5961     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
5962     __ add(U_2, U_2, tmp2);
5963 
5964     // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
5965     __ slli(tmp1, tmp1, 2);
5966     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
5967     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
5968     __ add(U_2, U_2, tmp2);
5969   }
5970 
5971   // Poly1305, RFC 7539
5972   // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
5973 
5974   // Arguments:
5975   //    c_rarg0:   input_start -- where the input is stored
5976   //    c_rarg1:   length
5977   //    c_rarg2:   acc_start -- where the output will be stored
5978   //    c_rarg3:   r_start -- where the randomly generated 128-bit key is stored
5979 
5980   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
5981   // description of the tricks used to simplify and accelerate this
5982   // computation.
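       // In outline, each 16-byte block c_i updates the accumulator as
       //   acc <- ((acc + c_i + 2^128) * r) mod (2^130 - 5)
       // where the 2^128 term is the padding bit RFC 7539 appends to every full block
       // (see the addi(S_2, S_2, 1) in the loop below).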
5983 
5984   address generate_poly1305_processBlocks() {
5985     __ align(CodeEntryAlignment);
5986     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
5987     address start = __ pc();
5988     __ enter();
5989     Label here;
5990 
5991     RegSet saved_regs = RegSet::range(x18, x21);
5992     RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
5993     __ push_reg(saved_regs, sp);
5994 
5995     // Arguments
5996     const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
5997 
5998     // R_n is the 128-bit randomly-generated key, packed into two
5999     // registers. The caller passes this key to us as long[5], with
6000     // BITS_PER_LIMB = 26.
6001     const Register R_0 = *regs, R_1 = *++regs;
6002     poly1305_pack_26(R_0, R_1, r_start, t1, t2);
6003 
6004     // RR_n is (R_n >> 2) * 5
6005     const Register RR_0 = *++regs, RR_1 = *++regs;
6006     __ srli(t1, R_0, 2);
6007     __ shadd(RR_0, t1, t1, t2, 2);
6008     __ srli(t1, R_1, 2);
6009     __ shadd(RR_1, t1, t1, t2, 2);
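         // shadd computes (t1 << 2) + t1 here, i.e. RR_n = 5 * (R_n >> 2)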
6010 
6011     // U_n is the current checksum
6012     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
6013     poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
6014 
6015     static constexpr int BLOCK_LENGTH = 16;
6016     Label DONE, LOOP;
6017 
6018     __ mv(t1, BLOCK_LENGTH);
6019     __ blt(length, t1, DONE); {
6020       __ bind(LOOP);
6021 
6022       // S_n is to be the sum of U_n and the next block of data
6023       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
6024       __ ld(S_0, Address(input_start, 0));
6025       __ ld(S_1, Address(input_start, wordSize));
6026 
6027       __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
6028       __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
6029       __ add(S_2, U_2, t1);
6030 
6031       __ addi(S_2, S_2, 1);
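           // The +1 lands at bit 128: the padding bit RFC 7539 appends to each full 16-byte block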
6032 
6033       const Register U_0HI = *++regs, U_1HI = *++regs;
6034 
6035       // NB: this logic depends on some of the special properties of
6036       // Poly1305 keys. In particular, because we know that the top
6037       // four bits of R_0 and R_1 are zero, we can add together
6038       // partial products without any risk of needing to propagate a
6039       // carry out.
6040       __ wide_mul(U_0, U_0HI, S_0, R_0);
6041       __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
6042       __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
6043 
6044       __ wide_mul(U_1, U_1HI, S_0, R_1);
6045       __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
6046       __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
6047 
6048       __ andi(U_2, R_0, right_2_bits);
6049       __ mul(U_2, S_2, U_2);
6050 
6051       // Partial reduction mod 2**130 - 5
6052       __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
6053       __ adc(U_2, U_2, U_1HI, t1);
6054       // Sum is now in U_2:U_1:U_0.
6055 
6056       // U_2:U_1:U_0 += (U_2 >> 2) * 5
6057       poly1305_reduce(U_2, U_1, U_0, t1, t2);
6058 
6059       __ sub(length, length, BLOCK_LENGTH);
6060       __ addi(input_start, input_start, BLOCK_LENGTH);
6061       __ mv(t1, BLOCK_LENGTH);
6062       __ bge(length, t1, LOOP);
6063     }
6064 
6065     // Further reduce modulo 2^130 - 5
6066     poly1305_reduce(U_2, U_1, U_0, t1, t2);
6067 
6068     // Unpack the sum into five 26-bit limbs and write to memory.
6069     // The lowest 26 bits form the first limb
6070     __ slli(t1, U_0, 38); // Take lowest 26 bits
6071     __ srli(t1, t1, 38);
6072     __ sd(t1, Address(acc_start)); // First 26-bit limb
6073 
6074     // Bits 27-52 of U_0 form the second limb
6075     __ slli(t1, U_0, 12); // Take next 27-52 bits
6076     __ srli(t1, t1, 38);
6077     __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
6078 
6079     // Getting bits 53-64 of U_0 and bits 1-14 of U_1 in one register
6080     __ srli(t1, U_0, 52);
6081     __ slli(t2, U_1, 50);
6082     __ srli(t2, t2, 38);
6083     __ add(t1, t1, t2);
6084     __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
6085 
6086     // Storing 15-40 bits of U_1
6087     __ slli(t1, U_1, 24); // Already used up 14 bits
6088     __ srli(t1, t1, 38); // Clear all other bits from t1
6089     __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
6090 
6091     // Storing bits 41-64 of U_1 and the lowest three bits of U_2 in one register
6092     __ srli(t1, U_1, 40);
6093     __ andi(t2, U_2, right_3_bits);
6094     __ slli(t2, t2, 24);
6095     __ add(t1, t1, t2);
6096     __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
6097 
6098     __ bind(DONE);
6099     __ pop_reg(saved_regs, sp);
6100     __ leave(); // Required for proper stackwalking
6101     __ ret();
6102 
6103     return start;
6104   }
6105 
6106 #endif // COMPILER2
6107 
6108   /**
6109    *  Arguments:
6110    *
6111    * Inputs:
6112    *   c_rarg0   - int crc
6113    *   c_rarg1   - byte* buf
6114    *   c_rarg2   - int length
6115    *
6116    * Output:
6117    *   c_rarg0   - int crc result
6118    */
6119   address generate_updateBytesCRC32() {
6120     assert(UseCRC32Intrinsics, "what are we doing here?");
6121 
6122     __ align(CodeEntryAlignment);
6123     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6124 
6125     address start = __ pc();
6126 
6127     // input parameters
6128     const Register crc    = c_rarg0;  // crc
6129     const Register buf    = c_rarg1;  // source java byte array address
6130     const Register len    = c_rarg2;  // length
6131 
6132     BLOCK_COMMENT("Entry:");
6133     __ enter(); // required for proper stackwalking of RuntimeStub frame
6134 
6135     __ kernel_crc32(crc, buf, len,
6136                     c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables
6137                     c_rarg7, t2, x28, x29, x30, x31);   // misc tmps
6138 
6139     __ leave(); // required for proper stackwalking of RuntimeStub frame
6140     __ ret();
6141 
6142     return start;
6143   }
6144 
6145   // exception handler for upcall stubs
6146   address generate_upcall_stub_exception_handler() {
6147     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
6148     address start = __ pc();
6149 
6150     // The native caller has no idea how to handle exceptions,
6151     // so we just crash here. It is up to the callee to catch exceptions.
6152     __ verify_oop(x10); // the exception oop is expected in x10 (a0)
6153     __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
6154     __ should_not_reach_here();
6155 
6156     return start;
6157   }
6158 
6159   // load Method* target of MethodHandle
6160   // j_rarg0 = jobject receiver
6161   // xmethod = Method* result
6162   address generate_upcall_stub_load_target() {
6163 
6164     StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target");
6165     address start = __ pc();
6166 
6167     __ resolve_global_jobject(j_rarg0, t0, t1);
6168     // Load target method from receiver
6169     __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
6170     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
6171     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
6172     __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
6173                       Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
6174                       noreg, noreg);
6175     __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
6176 
6177     __ ret();
6178 
6179     return start;
6180   }
6181 
6182 #undef __
6183 
6184   // Initialization
6185   void generate_initial_stubs() {
6186     // Generates the initial stubs and initializes the entry points
6187 
6188     // entry points that exist on all platforms. Note: this is code
6189     // that could be shared among different platforms; however, the
6190     // benefit seems to be smaller than the disadvantage of having a
6191     // much more complicated generator structure. See also the comment in
6192     // stubRoutines.hpp.
6193 
6194     StubRoutines::_forward_exception_entry = generate_forward_exception();
6195 
6196     if (UnsafeMemoryAccess::_table == nullptr) {
6197       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
6198     }
6199 
6200     StubRoutines::_call_stub_entry =
6201       generate_call_stub(StubRoutines::_call_stub_return_address);
6202 
6203     // referenced by megamorphic calls
6204     StubRoutines::_catch_exception_entry = generate_catch_exception();
6205 
6206     if (UseCRC32Intrinsics) {
6207       // set the table address before generating stubs that use it
6208       StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table;
6209       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
6210     }
6211   }
6212 
6213   void generate_continuation_stubs() {
6214     // Continuation stubs:
6215     StubRoutines::_cont_thaw             = generate_cont_thaw();
6216     StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
6217     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
6218     StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
6219   }
6220 
6221   void generate_final_stubs() {
6222     // support for verify_oop (must happen after universe_init)
6223     if (VerifyOops) {
6224       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
6225     }
6226 
6227     // arraycopy stubs used by compilers
6228     generate_arraycopy_stubs();
6229 
6230     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
6231     if (bs_nm != nullptr) {
6232       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
6233     }
6234 
6235 #ifdef COMPILER2
6236     if (UseSecondarySupersTable) {
6237       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
6238       if (!InlineSecondarySupersTest) {
6239         for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
6240           StubRoutines::_lookup_secondary_supers_table_stubs[slot]
6241             = generate_lookup_secondary_supers_table_stub(slot);
6242         }
6243       }
6244     }
6245 #endif // COMPILER2
6246 
6247     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
6248     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
6249 
6250     StubRoutines::riscv::set_completed();
6251   }
6252 
6253   void generate_compiler_stubs() {
6254 #ifdef COMPILER2
6255     if (UseMulAddIntrinsic) {
6256       StubRoutines::_mulAdd = generate_mulAdd();
6257     }
6258 
6259     if (UseMultiplyToLenIntrinsic) {
6260       StubRoutines::_multiplyToLen = generate_multiplyToLen();
6261     }
6262 
6263     if (UseSquareToLenIntrinsic) {
6264       StubRoutines::_squareToLen = generate_squareToLen();
6265     }
6266 
6267     if (UseMontgomeryMultiplyIntrinsic) {
6268       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
6269       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
6270       StubRoutines::_montgomeryMultiply = g.generate_multiply();
6271     }
6272 
6273     if (UseMontgomerySquareIntrinsic) {
6274       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
6275       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
6276       StubRoutines::_montgomerySquare = g.generate_square();
6277     }
6278 
6279     if (UsePoly1305Intrinsics) {
6280       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
6281     }
6282 
6283     if (UseRVVForBigIntegerShiftIntrinsics) {
6284       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
6285       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
6286     }
6287 
6288     if (UseSHA256Intrinsics) {
6289       Sha2Generator sha2(_masm, this);
6290       StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(false);
6291       StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true);
6292     }
6293 
6294     if (UseSHA512Intrinsics) {
6295       Sha2Generator sha2(_masm, this);
6296       StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(false);
6297       StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true);
6298     }
6299 
6300     if (UseMD5Intrinsics) {
6301       StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
6302       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");
6303     }
6304 
6305     if (UseChaCha20Intrinsics) {
6306       StubRoutines::_chacha20Block = generate_chacha20Block();
6307     }
6308 
6309     if (UseSHA1Intrinsics) {
6310       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false, "sha1_implCompress");
6311       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true, "sha1_implCompressMB");
6312     }
6313 
6314     if (UseBASE64Intrinsics) {
6315       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
6316       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
6317     }
6318 
6319     if (UseAdler32Intrinsics) {
6320       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6321     }
6322 
6323     generate_compare_long_strings();
6324 
6325     generate_string_indexof_stubs();
6326 
6327 #endif // COMPILER2
6328   }
6329 
6330  public:
6331   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
6332     switch(kind) {
6333     case Initial_stubs:
6334       generate_initial_stubs();
6335       break;
6336     case Continuation_stubs:
6337       generate_continuation_stubs();
6338       break;
6339     case Compiler_stubs:
6340       generate_compiler_stubs();
6341       break;
6342     case Final_stubs:
6343       generate_final_stubs();
6344       break;
6345     default:
6346       fatal("unexpected stubs kind: %d", kind);
6347       break;
6348     };
6349   }
6350 }; // end class declaration
6351 
6352 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
6353   StubGenerator g(code, kind);
6354 }