1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/universe.hpp"
  35 #include "nativeInst_riscv.hpp"
  36 #include "oops/instanceOop.hpp"
  37 #include "oops/method.hpp"
  38 #include "oops/objArrayKlass.hpp"
  39 #include "oops/oop.inline.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "prims/upcallLinker.hpp"
  42 #include "runtime/continuation.hpp"
  43 #include "runtime/continuationEntry.inline.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/javaThread.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/runtime.hpp"
  54 #endif
  55 
  56 // Declaration and definition of StubGenerator (no .hpp file).
  57 // For a more detailed description of the stub routine structure
  58 // see the comment in stubRoutines.hpp
  59 
  60 #undef __
  61 #define __ _masm->
  62 
  63 #ifdef PRODUCT
  64 #define BLOCK_COMMENT(str) /* nothing */
  65 #else
  66 #define BLOCK_COMMENT(str) __ block_comment(str)
  67 #endif
  68 
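// bind a label and emit its name as a block comment so the label
// position is visible in disassembly listings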
  69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  70 
  71 // Stub Code definitions
  72 
  73 class StubGenerator: public StubCodeGenerator {
  74  private:
  75 
  76 #ifdef PRODUCT
  77 #define inc_counter_np(counter) ((void)0)
  78 #else
  79   void inc_counter_np_(uint& counter) {
  80     __ incrementw(ExternalAddress((address)&counter));
  81   }
  82 #define inc_counter_np(counter) \
  83   BLOCK_COMMENT("inc_counter " #counter); \
  84   inc_counter_np_(counter);
  85 #endif
  86 
  87   // Call stubs are used to call Java from C
  88   //
  89   // Arguments:
  90   //    c_rarg0:   call wrapper address                   address
  91   //    c_rarg1:   result                                 address
  92   //    c_rarg2:   result type                            BasicType
  93   //    c_rarg3:   method                                 Method*
  94   //    c_rarg4:   (interpreter) entry point              address
  95   //    c_rarg5:   parameters                             intptr_t*
  96   //    c_rarg6:   parameter size (in words)              int
  97   //    c_rarg7:   thread                                 Thread*
  98   //
  99   // There is no return from the stub itself as any Java result
 100   // is written to result
 101   //
  // we save x1 (ra) as the return PC at the base of the frame and
  // link x8 (fp) below it as the frame pointer, installing the
  // incoming sp (x2) into fp.
 105   //
 106   // we save x10-x17, which accounts for all the c arguments.
 107   //
 108   // TODO: strictly do we need to save them all? they are treated as
 109   // volatile by C so could we omit saving the ones we are going to
 110   // place in global registers (thread? method?) or those we only use
 111   // during setup of the Java call?
 112   //
 113   // we don't need to save x5 which C uses as an indirect result location
 114   // return register.
 115   //
 116   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 117   // volatile
 118   //
 119   // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
 120   // registers and C expects to be callee-save
 121   //
 122   // so the stub frame looks like this when we enter Java code
 123   //
 124   //     [ return_from_Java     ] <--- sp
 125   //     [ argument word n      ]
 126   //      ...
 127   // -35 [ argument word 1      ]
 128   // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
 129   // -33 [ saved f27            ]
 130   // -32 [ saved f26            ]
 131   // -31 [ saved f25            ]
 132   // -30 [ saved f24            ]
 133   // -29 [ saved f23            ]
 134   // -28 [ saved f22            ]
 135   // -27 [ saved f21            ]
 136   // -26 [ saved f20            ]
 137   // -25 [ saved f19            ]
 138   // -24 [ saved f18            ]
 139   // -23 [ saved f9             ]
 140   // -22 [ saved f8             ]
 141   // -21 [ saved x27            ]
 142   // -20 [ saved x26            ]
 143   // -19 [ saved x25            ]
 144   // -18 [ saved x24            ]
 145   // -17 [ saved x23            ]
 146   // -16 [ saved x22            ]
 147   // -15 [ saved x21            ]
 148   // -14 [ saved x20            ]
 149   // -13 [ saved x19            ]
 150   // -12 [ saved x18            ]
 151   // -11 [ saved x9             ]
 152   // -10 [ call wrapper   (x10) ]
 153   //  -9 [ result         (x11) ]
 154   //  -8 [ result type    (x12) ]
 155   //  -7 [ method         (x13) ]
 156   //  -6 [ entry point    (x14) ]
 157   //  -5 [ parameters     (x15) ]
 158   //  -4 [ parameter size (x16) ]
 159   //  -3 [ thread         (x17) ]
 160   //  -2 [ saved fp       (x8)  ]
 161   //  -1 [ saved ra       (x1)  ]
 162   //   0 [                      ] <--- fp == saved sp (x2)
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off  = -34,
 167 
 168     frm_off            = sp_after_call_off,
 169     f27_off            = -33,
 170     f26_off            = -32,
 171     f25_off            = -31,
 172     f24_off            = -30,
 173     f23_off            = -29,
 174     f22_off            = -28,
 175     f21_off            = -27,
 176     f20_off            = -26,
 177     f19_off            = -25,
 178     f18_off            = -24,
 179     f9_off             = -23,
 180     f8_off             = -22,
 181 
 182     x27_off            = -21,
 183     x26_off            = -20,
 184     x25_off            = -19,
 185     x24_off            = -18,
 186     x23_off            = -17,
 187     x22_off            = -16,
 188     x21_off            = -15,
 189     x20_off            = -14,
 190     x19_off            = -13,
 191     x18_off            = -12,
 192     x9_off             = -11,
 193 
 194     call_wrapper_off   = -10,
 195     result_off         = -9,
 196     result_type_off    = -8,
 197     method_off         = -7,
 198     entry_point_off    = -6,
 199     parameters_off     = -5,
 200     parameter_size_off = -4,
 201     thread_off         = -3,
 202     fp_f               = -2,
 203     retaddr_off        = -1,
 204   };
 205 
 206   address generate_call_stub(address& return_address) {
 207     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 208            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 209            "adjust this code");
 210 
 211     StubCodeMark mark(this, "StubRoutines", "call_stub");
 212     address start = __ pc();
 213 
 214     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 215 
 216     const Address frm_save      (fp, frm_off           * wordSize);
 217     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 218     const Address result        (fp, result_off         * wordSize);
 219     const Address result_type   (fp, result_type_off    * wordSize);
 220     const Address method        (fp, method_off         * wordSize);
 221     const Address entry_point   (fp, entry_point_off    * wordSize);
 222     const Address parameters    (fp, parameters_off     * wordSize);
 223     const Address parameter_size(fp, parameter_size_off * wordSize);
 224 
 225     const Address thread        (fp, thread_off         * wordSize);
 226 
 227     const Address f27_save      (fp, f27_off            * wordSize);
 228     const Address f26_save      (fp, f26_off            * wordSize);
 229     const Address f25_save      (fp, f25_off            * wordSize);
 230     const Address f24_save      (fp, f24_off            * wordSize);
 231     const Address f23_save      (fp, f23_off            * wordSize);
 232     const Address f22_save      (fp, f22_off            * wordSize);
 233     const Address f21_save      (fp, f21_off            * wordSize);
 234     const Address f20_save      (fp, f20_off            * wordSize);
 235     const Address f19_save      (fp, f19_off            * wordSize);
 236     const Address f18_save      (fp, f18_off            * wordSize);
 237     const Address f9_save       (fp, f9_off             * wordSize);
 238     const Address f8_save       (fp, f8_off             * wordSize);
 239 
 240     const Address x27_save      (fp, x27_off            * wordSize);
 241     const Address x26_save      (fp, x26_off            * wordSize);
 242     const Address x25_save      (fp, x25_off            * wordSize);
 243     const Address x24_save      (fp, x24_off            * wordSize);
 244     const Address x23_save      (fp, x23_off            * wordSize);
 245     const Address x22_save      (fp, x22_off            * wordSize);
 246     const Address x21_save      (fp, x21_off            * wordSize);
 247     const Address x20_save      (fp, x20_off            * wordSize);
 248     const Address x19_save      (fp, x19_off            * wordSize);
 249     const Address x18_save      (fp, x18_off            * wordSize);
 250 
 251     const Address x9_save       (fp, x9_off             * wordSize);
 252 
 253     // stub code
 254 
 255     address riscv_entry = __ pc();
 256 
 257     // set up frame and move sp to end of save area
 258     __ enter();
 259     __ addi(sp, fp, sp_after_call_off * wordSize);
 260 
 261     // save register parameters and Java temporary/global registers
 262     // n.b. we save thread even though it gets installed in
    // xthread because we want to sanity check it later
 264     __ sd(c_rarg7, thread);
 265     __ sw(c_rarg6, parameter_size);
 266     __ sd(c_rarg5, parameters);
 267     __ sd(c_rarg4, entry_point);
 268     __ sd(c_rarg3, method);
 269     __ sd(c_rarg2, result_type);
 270     __ sd(c_rarg1, result);
 271     __ sd(c_rarg0, call_wrapper);
 272 
 273     __ sd(x9, x9_save);
 274 
 275     __ sd(x18, x18_save);
 276     __ sd(x19, x19_save);
 277     __ sd(x20, x20_save);
 278     __ sd(x21, x21_save);
 279     __ sd(x22, x22_save);
 280     __ sd(x23, x23_save);
 281     __ sd(x24, x24_save);
 282     __ sd(x25, x25_save);
 283     __ sd(x26, x26_save);
 284     __ sd(x27, x27_save);
 285 
 286     __ fsd(f8,  f8_save);
 287     __ fsd(f9,  f9_save);
 288     __ fsd(f18, f18_save);
 289     __ fsd(f19, f19_save);
 290     __ fsd(f20, f20_save);
 291     __ fsd(f21, f21_save);
 292     __ fsd(f22, f22_save);
 293     __ fsd(f23, f23_save);
 294     __ fsd(f24, f24_save);
 295     __ fsd(f25, f25_save);
 296     __ fsd(f26, f26_save);
 297     __ fsd(f27, f27_save);
 298 
 299     __ frrm(t0);
 300     __ sd(t0, frm_save);
 301     // Set frm to the state we need. We do want Round to Nearest. We
 302     // don't want non-IEEE rounding modes.
 303     Label skip_fsrmi;
 304     guarantee(__ RoundingMode::rne == 0, "must be");
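    // t0 still holds the FRM value saved above; rne is encoded as 0, so a
    // zero value means the rounding mode is already what we want and the
    // CSR write can be skipped.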
 305     __ beqz(t0, skip_fsrmi);
 306     __ fsrmi(__ RoundingMode::rne);
 307     __ bind(skip_fsrmi);
 308 
 309     // install Java thread in global register now we have saved
 310     // whatever value it held
 311     __ mv(xthread, c_rarg7);
 312 
 313     // And method
 314     __ mv(xmethod, c_rarg3);
 315 
 316     // set up the heapbase register
 317     __ reinit_heapbase();
 318 
 319 #ifdef ASSERT
 320     // make sure we have no pending exceptions
 321     {
 322       Label L;
 323       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 324       __ beqz(t0, L);
 325       __ stop("StubRoutines::call_stub: entered with pending exception");
 326       __ BIND(L);
 327     }
 328 #endif
 329     // pass parameters if any
 330     __ mv(esp, sp);
 331     __ slli(t0, c_rarg6, LogBytesPerWord);
 332     __ sub(t0, sp, t0); // Move SP out of the way
 333     __ andi(sp, t0, -2 * wordSize);
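    // the andi with -2 * wordSize (-16) rounds sp down to a 16-byte boundary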
 334 
 335     BLOCK_COMMENT("pass parameters if any");
 336     Label parameters_done;
 337     // parameter count is still in c_rarg6
 338     // and parameter pointer identifying param 1 is in c_rarg5
 339     __ beqz(c_rarg6, parameters_done);
 340 
 341     address loop = __ pc();
 342     __ ld(t0, Address(c_rarg5, 0));
 343     __ addi(c_rarg5, c_rarg5, wordSize);
 344     __ addi(c_rarg6, c_rarg6, -1);
 345     __ push_reg(t0);
 346     __ bgtz(c_rarg6, loop);
 347 
 348     __ BIND(parameters_done);
 349 
    // call Java entry -- passing Method* and current sp
 351     //      xmethod: Method*
 352     //      x19_sender_sp: sender sp
 353     BLOCK_COMMENT("call Java function");
 354     __ mv(x19_sender_sp, sp);
 355     __ jalr(c_rarg4);
 356 
 357     // save current address for use by exception handling code
 358 
 359     return_address = __ pc();
 360 
 361     // store result depending on type (everything that is not
 362     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 363     // n.b. this assumes Java returns an integral result in x10
 364     // and a floating result in j_farg0
 365     __ ld(j_rarg2, result);
 366     Label is_long, is_float, is_double, exit;
 367     __ ld(j_rarg1, result_type);
 368     __ mv(t0, (u1)T_OBJECT);
 369     __ beq(j_rarg1, t0, is_long);
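    // T_OBJECT results are stored with the same 64-bit store as T_LONG,
    // so both types share the is_long path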
 370     __ mv(t0, (u1)T_LONG);
 371     __ beq(j_rarg1, t0, is_long);
 372     __ mv(t0, (u1)T_FLOAT);
 373     __ beq(j_rarg1, t0, is_float);
 374     __ mv(t0, (u1)T_DOUBLE);
 375     __ beq(j_rarg1, t0, is_double);
 376 
 377     // handle T_INT case
 378     __ sw(x10, Address(j_rarg2));
 379 
 380     __ BIND(exit);
 381 
 382     // pop parameters
 383     __ addi(esp, fp, sp_after_call_off * wordSize);
 384 
 385 #ifdef ASSERT
 386     // verify that threads correspond
 387     {
 388       Label L, S;
 389       __ ld(t0, thread);
 390       __ bne(xthread, t0, S);
 391       __ get_thread(t0);
 392       __ beq(xthread, t0, L);
 393       __ BIND(S);
 394       __ stop("StubRoutines::call_stub: threads must correspond");
 395       __ BIND(L);
 396     }
 397 #endif
 398 
 399     __ pop_cont_fastpath(xthread);
 400 
 401     // restore callee-save registers
 402     __ fld(f27, f27_save);
 403     __ fld(f26, f26_save);
 404     __ fld(f25, f25_save);
 405     __ fld(f24, f24_save);
 406     __ fld(f23, f23_save);
 407     __ fld(f22, f22_save);
 408     __ fld(f21, f21_save);
 409     __ fld(f20, f20_save);
 410     __ fld(f19, f19_save);
 411     __ fld(f18, f18_save);
 412     __ fld(f9,  f9_save);
 413     __ fld(f8,  f8_save);
 414 
 415     __ ld(x27, x27_save);
 416     __ ld(x26, x26_save);
 417     __ ld(x25, x25_save);
 418     __ ld(x24, x24_save);
 419     __ ld(x23, x23_save);
 420     __ ld(x22, x22_save);
 421     __ ld(x21, x21_save);
 422     __ ld(x20, x20_save);
 423     __ ld(x19, x19_save);
 424     __ ld(x18, x18_save);
 425 
 426     __ ld(x9, x9_save);
 427 
 428     // restore frm
 429     Label skip_fsrm;
 430     __ ld(t0, frm_save);
 431     __ frrm(t1);
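    // write the rounding-mode CSR only if the saved value differs from the
    // current one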
 432     __ beq(t0, t1, skip_fsrm);
 433     __ fsrm(t0);
 434     __ bind(skip_fsrm);
 435 
 436     __ ld(c_rarg0, call_wrapper);
 437     __ ld(c_rarg1, result);
 438     __ ld(c_rarg2, result_type);
 439     __ ld(c_rarg3, method);
 440     __ ld(c_rarg4, entry_point);
 441     __ ld(c_rarg5, parameters);
 442     __ ld(c_rarg6, parameter_size);
 443     __ ld(c_rarg7, thread);
 444 
 445     // leave frame and return to caller
 446     __ leave();
 447     __ ret();
 448 
 449     // handle return types different from T_INT
 450 
 451     __ BIND(is_long);
 452     __ sd(x10, Address(j_rarg2, 0));
 453     __ j(exit);
 454 
 455     __ BIND(is_float);
 456     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 457     __ j(exit);
 458 
 459     __ BIND(is_double);
 460     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 461     __ j(exit);
 462 
 463     return start;
 464   }
 465 
 466   // Return point for a Java call if there's an exception thrown in
 467   // Java code.  The exception is caught and transformed into a
 468   // pending exception stored in JavaThread that can be tested from
 469   // within the VM.
 470   //
 471   // Note: Usually the parameters are removed by the callee. In case
 472   // of an exception crossing an activation frame boundary, that is
 473   // not the case if the callee is compiled code => need to setup the
 474   // sp.
 475   //
 476   // x10: exception oop
 477 
 478   address generate_catch_exception() {
 479     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 480     address start = __ pc();
 481 
 482     // same as in generate_call_stub():
 483     const Address thread(fp, thread_off * wordSize);
 484 
 485 #ifdef ASSERT
 486     // verify that threads correspond
 487     {
 488       Label L, S;
 489       __ ld(t0, thread);
 490       __ bne(xthread, t0, S);
 491       __ get_thread(t0);
 492       __ beq(xthread, t0, L);
 493       __ bind(S);
 494       __ stop("StubRoutines::catch_exception: threads must correspond");
 495       __ bind(L);
 496     }
 497 #endif
 498 
 499     // set pending exception
 500     __ verify_oop(x10);
 501 
 502     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
 503     __ mv(t0, (address)__FILE__);
 504     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 505     __ mv(t0, (int)__LINE__);
 506     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 507 
 508     // complete return to VM
 509     assert(StubRoutines::_call_stub_return_address != nullptr,
 510            "_call_stub_return_address must have been generated before");
 511     __ j(RuntimeAddress(StubRoutines::_call_stub_return_address));
 512 
 513     return start;
 514   }
 515 
 516   // Continuation point for runtime calls returning with a pending
 517   // exception.  The pending exception check happened in the runtime
 518   // or native call stub.  The pending exception in Thread is
 519   // converted into a Java-level exception.
 520   //
 521   // Contract with Java-level exception handlers:
 522   // x10: exception
 523   // x13: throwing pc
 524   //
 525   // NOTE: At entry of this stub, exception-pc must be in RA !!
 526 
 527   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 529 
 530   address generate_forward_exception() {
 531     StubCodeMark mark(this, "StubRoutines", "forward exception");
 532     address start = __ pc();
 533 
 534     // Upon entry, RA points to the return address returning into
 535     // Java (interpreted or compiled) code; i.e., the return address
 536     // becomes the throwing pc.
 537     //
 538     // Arguments pushed before the runtime call are still on the stack
 539     // but the exception handler will reset the stack pointer ->
 540     // ignore them.  A potential result in registers can be ignored as
 541     // well.
 542 
 543 #ifdef ASSERT
 544     // make sure this code is only executed if there is a pending exception
 545     {
 546       Label L;
 547       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 548       __ bnez(t0, L);
 549       __ stop("StubRoutines::forward exception: no pending exception (1)");
 550       __ bind(L);
 551     }
 552 #endif
 553 
 554     // compute exception handler into x9
 555 
 556     // call the VM to find the handler address associated with the
 557     // caller address. pass thread in x10 and caller pc (ret address)
 558     // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
 559     // the stack.
 560     __ mv(c_rarg1, ra);
 561     // ra will be trashed by the VM call so we move it to x9
 562     // (callee-saved) because we also need to pass it to the handler
 563     // returned by this call.
 564     __ mv(x9, ra);
 565     BLOCK_COMMENT("call exception_handler_for_return_address");
 566     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 567                          SharedRuntime::exception_handler_for_return_address),
 568                     xthread, c_rarg1);
 569     // we should not really care that ra is no longer the callee
 570     // address. we saved the value the handler needs in x9 so we can
 571     // just copy it to x13. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 573     // the PC for the frame above the handler belongs to a compiled
 574     // Java method. So, we restore ra here to satisfy that assert.
 575     __ mv(ra, x9);
 576     // setup x10 & x13 & clear pending exception
 577     __ mv(x13, x9);
 578     __ mv(x9, x10);
 579     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 580     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 581 
 582 #ifdef ASSERT
 583     // make sure exception is set
 584     {
 585       Label L;
 586       __ bnez(x10, L);
 587       __ stop("StubRoutines::forward exception: no pending exception (2)");
 588       __ bind(L);
 589     }
 590 #endif
 591 
 592     // continue at exception handler
 593     // x10: exception
 594     // x13: throwing pc
 595     // x9: exception handler
 596     __ verify_oop(x10);
 597     __ jr(x9);
 598 
 599     return start;
 600   }
 601 
 602   // Non-destructive plausibility checks for oops
 603   //
 604   // Arguments:
 605   //    x10: oop to verify
 606   //    t0: error message
 607   //
 608   // Stack after saving c_rarg3:
 609   //    [tos + 0]: saved c_rarg3
 610   //    [tos + 1]: saved c_rarg2
 611   //    [tos + 2]: saved ra
 612   //    [tos + 3]: saved t1
 613   //    [tos + 4]: saved x10
 614   //    [tos + 5]: saved t0
 615   address generate_verify_oop() {
 616 
 617     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 618     address start = __ pc();
 619 
 620     Label exit, error;
 621 
 622     __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
 623 
 624     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 625     __ ld(c_rarg3, Address(c_rarg2));
 626     __ add(c_rarg3, c_rarg3, 1);
 627     __ sd(c_rarg3, Address(c_rarg2));
 628 
 629     // object is in x10
 630     // make sure object is 'reasonable'
 631     __ beqz(x10, exit); // if obj is null it is OK
 632 
 633     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 634     bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
 635 
 636     // return if everything seems ok
 637     __ bind(exit);
 638 
 639     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3
 640     __ ret();
 641 
 642     // handle errors
 643     __ bind(error);
 644     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
 645 
 646     __ push_reg(RegSet::range(x0, x31), sp);
 647     // debug(char* msg, int64_t pc, int64_t regs[])
 648     __ mv(c_rarg0, t0);             // pass address of error message
 649     __ mv(c_rarg1, ra);             // pass return address
 650     __ mv(c_rarg2, sp);             // pass address of regs on stack
 651 #ifndef PRODUCT
 652     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 653 #endif
 654     BLOCK_COMMENT("call MacroAssembler::debug");
 655     __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 656     __ ebreak();
 657 
 658     return start;
 659   }
 660 
 661   // The inner part of zero_words().
 662   //
 663   // Inputs:
 664   // x28: the HeapWord-aligned base address of an array to zero.
 665   // x29: the count in HeapWords, x29 > 0.
 666   //
 667   // Returns x28 and x29, adjusted for the caller to clear.
 668   // x28: the base address of the tail of words left to clear.
 669   // x29: the number of words in the tail.
 670   //      x29 < MacroAssembler::zero_words_block_size.
 671 
 672   address generate_zero_blocks() {
 673     Label done;
 674 
 675     const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
 676 
 677     __ align(CodeEntryAlignment);
 678     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 679     address start = __ pc();
 680 
 681     if (UseBlockZeroing) {
 682       // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero
 683       // after alignment.
 684       Label small;
 685       int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
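      // low_limit is in words; CacheLineSize and BlockZeroingLowLimit are
      // byte values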
 686       __ mv(tmp1, low_limit);
 687       __ blt(cnt, tmp1, small);
 688       __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
 689       __ bind(small);
 690     }
 691 
 692     {
 693       // Clear the remaining blocks.
 694       Label loop;
 695       __ mv(tmp1, MacroAssembler::zero_words_block_size);
 696       __ blt(cnt, tmp1, done);
 697       __ bind(loop);
 698       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 699         __ sd(zr, Address(base, i * wordSize));
 700       }
 701       __ add(base, base, MacroAssembler::zero_words_block_size * wordSize);
 702       __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
 703       __ bge(cnt, tmp1, loop);
 704       __ bind(done);
 705     }
 706 
 707     __ ret();
 708 
 709     return start;
 710   }
 711 
 712   typedef enum {
 713     copy_forwards = 1,
 714     copy_backwards = -1
 715   } copy_direction;
 716 
 717   // Bulk copy of blocks of 8 words.
 718   //
 719   // count is a count of words.
 720   //
 721   // Precondition: count >= 8
 722   //
 723   // Postconditions:
 724   //
 725   // The least significant bit of count contains the remaining count
 726   // of words to copy.  The rest of count is trash.
 727   //
 728   // s and d are adjusted to point to the remaining words to copy
 729   //
 730   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 731                            copy_direction direction) {
 732     int unit = wordSize * direction;
 733     int bias = wordSize;
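    // unit is +wordSize for forward copies and -wordSize for backward copies.
    // For forward copies s and d are biased down by one word below, so that
    // Address(reg, 1 * unit) always addresses the next word to transfer in
    // either direction.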
 734 
 735     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 736       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 737 
 738     const Register stride = x30;
 739 
 740     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 741       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 742     assert_different_registers(s, d, count, t0);
 743 
 744     Label again, drain;
 745     const char* stub_name = nullptr;
 746     if (direction == copy_forwards) {
 747       stub_name = "forward_copy_longs";
 748     } else {
 749       stub_name = "backward_copy_longs";
 750     }
 751     StubCodeMark mark(this, "StubRoutines", stub_name);
 752     __ align(CodeEntryAlignment);
 753     __ bind(start);
 754 
 755     if (direction == copy_forwards) {
 756       __ sub(s, s, bias);
 757       __ sub(d, d, bias);
 758     }
 759 
 760 #ifdef ASSERT
 761     // Make sure we are never given < 8 words
 762     {
 763       Label L;
 764 
 765       __ mv(t0, 8);
 766       __ bge(count, t0, L);
      __ stop("generate_copy_longs called with < 8 words");
 768       __ bind(L);
 769     }
 770 #endif
 771 
 772     __ ld(tmp_reg0, Address(s, 1 * unit));
 773     __ ld(tmp_reg1, Address(s, 2 * unit));
 774     __ ld(tmp_reg2, Address(s, 3 * unit));
 775     __ ld(tmp_reg3, Address(s, 4 * unit));
 776     __ ld(tmp_reg4, Address(s, 5 * unit));
 777     __ ld(tmp_reg5, Address(s, 6 * unit));
 778     __ ld(tmp_reg6, Address(s, 7 * unit));
 779     __ ld(tmp_reg7, Address(s, 8 * unit));
 780     __ addi(s, s, 8 * unit);
 781 
 782     __ sub(count, count, 16);
 783     __ bltz(count, drain);
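    // eight words are already in flight in the tmp registers; the main loop
    // below stores one block while loading the next, so it needs at least
    // another eight words, otherwise go straight to the drain code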
 784 
 785     __ bind(again);
 786 
 787     __ sd(tmp_reg0, Address(d, 1 * unit));
 788     __ sd(tmp_reg1, Address(d, 2 * unit));
 789     __ sd(tmp_reg2, Address(d, 3 * unit));
 790     __ sd(tmp_reg3, Address(d, 4 * unit));
 791     __ sd(tmp_reg4, Address(d, 5 * unit));
 792     __ sd(tmp_reg5, Address(d, 6 * unit));
 793     __ sd(tmp_reg6, Address(d, 7 * unit));
 794     __ sd(tmp_reg7, Address(d, 8 * unit));
 795 
 796     __ ld(tmp_reg0, Address(s, 1 * unit));
 797     __ ld(tmp_reg1, Address(s, 2 * unit));
 798     __ ld(tmp_reg2, Address(s, 3 * unit));
 799     __ ld(tmp_reg3, Address(s, 4 * unit));
 800     __ ld(tmp_reg4, Address(s, 5 * unit));
 801     __ ld(tmp_reg5, Address(s, 6 * unit));
 802     __ ld(tmp_reg6, Address(s, 7 * unit));
 803     __ ld(tmp_reg7, Address(s, 8 * unit));
 804 
 805     __ addi(s, s, 8 * unit);
 806     __ addi(d, d, 8 * unit);
 807 
 808     __ sub(count, count, 8);
 809     __ bgez(count, again);
 810 
 811     // Drain
 812     __ bind(drain);
 813 
 814     __ sd(tmp_reg0, Address(d, 1 * unit));
 815     __ sd(tmp_reg1, Address(d, 2 * unit));
 816     __ sd(tmp_reg2, Address(d, 3 * unit));
 817     __ sd(tmp_reg3, Address(d, 4 * unit));
 818     __ sd(tmp_reg4, Address(d, 5 * unit));
 819     __ sd(tmp_reg5, Address(d, 6 * unit));
 820     __ sd(tmp_reg6, Address(d, 7 * unit));
 821     __ sd(tmp_reg7, Address(d, 8 * unit));
 822     __ addi(d, d, 8 * unit);
 823 
 824     {
 825       Label L1, L2;
 826       __ test_bit(t0, count, 2);
 827       __ beqz(t0, L1);
 828 
 829       __ ld(tmp_reg0, Address(s, 1 * unit));
 830       __ ld(tmp_reg1, Address(s, 2 * unit));
 831       __ ld(tmp_reg2, Address(s, 3 * unit));
 832       __ ld(tmp_reg3, Address(s, 4 * unit));
 833       __ addi(s, s, 4 * unit);
 834 
 835       __ sd(tmp_reg0, Address(d, 1 * unit));
 836       __ sd(tmp_reg1, Address(d, 2 * unit));
 837       __ sd(tmp_reg2, Address(d, 3 * unit));
 838       __ sd(tmp_reg3, Address(d, 4 * unit));
 839       __ addi(d, d, 4 * unit);
 840 
 841       __ bind(L1);
 842 
 843       if (direction == copy_forwards) {
 844         __ addi(s, s, bias);
 845         __ addi(d, d, bias);
 846       }
 847 
 848       __ test_bit(t0, count, 1);
 849       __ beqz(t0, L2);
 850       if (direction == copy_backwards) {
 851         __ addi(s, s, 2 * unit);
 852         __ ld(tmp_reg0, Address(s));
 853         __ ld(tmp_reg1, Address(s, wordSize));
 854         __ addi(d, d, 2 * unit);
 855         __ sd(tmp_reg0, Address(d));
 856         __ sd(tmp_reg1, Address(d, wordSize));
 857       } else {
 858         __ ld(tmp_reg0, Address(s));
 859         __ ld(tmp_reg1, Address(s, wordSize));
 860         __ addi(s, s, 2 * unit);
 861         __ sd(tmp_reg0, Address(d));
 862         __ sd(tmp_reg1, Address(d, wordSize));
 863         __ addi(d, d, 2 * unit);
 864       }
 865       __ bind(L2);
 866     }
 867 
 868     __ ret();
 869   }
 870 
 871   Label copy_f, copy_b;
 872 
 873   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
 874 
 875   void copy_memory_v(Register s, Register d, Register count, int step) {
 876     bool is_backward = step < 0;
 877     int granularity = uabs(step);
 878 
 879     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 880     assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
 881     Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
 882     Label loop_forward, loop_backward, done;
 883 
 884     __ mv(dst, d);
 885     __ mv(src, s);
 886     __ mv(cnt, count);
 887 
 888     __ bind(loop_forward);
 889     __ vsetvli(vl, cnt, sew, Assembler::m8);
 890     if (is_backward) {
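      // a single strip covering the whole remaining count (vl == cnt) is
      // fully loaded before it is stored, so the forward path is safe even
      // for an overlapping backward copy; otherwise copy strips from the end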
 891       __ bne(vl, cnt, loop_backward);
 892     }
 893 
 894     __ vlex_v(v0, src, sew);
 895     __ sub(cnt, cnt, vl);
 896     if (sew != Assembler::e8) {
      // when sew == e8 (i.e., element size is 1 byte), slli R, R, 0 is a no-op and unnecessary
 898       __ slli(vl, vl, sew);
 899     }
 900     __ add(src, src, vl);
 901 
 902     __ vsex_v(v0, dst, sew);
 903     __ add(dst, dst, vl);
 904     __ bnez(cnt, loop_forward);
 905 
 906     if (is_backward) {
 907       __ j(done);
 908 
 909       __ bind(loop_backward);
 910       __ sub(t0, cnt, vl);
 911       if (sew != Assembler::e8) {
        // when sew == e8 (i.e., element size is 1 byte), slli R, R, 0 is a no-op and unnecessary
 913         __ slli(t0, t0, sew);
 914       }
 915       __ add(tmp1, s, t0);
 916       __ vlex_v(v0, tmp1, sew);
 917       __ add(tmp2, d, t0);
 918       __ vsex_v(v0, tmp2, sew);
 919       __ sub(cnt, cnt, vl);
 920       __ bnez(cnt, loop_forward);
 921       __ bind(done);
 922     }
 923   }
 924 
 925   // All-singing all-dancing memory copy.
 926   //
 927   // Copy count units of memory from s to d.  The size of a unit is
 928   // step, which can be positive or negative depending on the direction
 929   // of copy.
 930   //
 931   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 932                    Register s, Register d, Register count, int step) {
 933     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 934     if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
 935       return copy_memory_v(s, d, count, step);
 936     }
 937 
 938     bool is_backwards = step < 0;
 939     int granularity = uabs(step);
 940 
 941     const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
 942     const Register gct1 = x28, gct2 = x29, gct3 = t2;
 943 
 944     Label same_aligned;
 945     Label copy_big, copy32_loop, copy8_loop, copy_small, done;
 946 
 947     // The size of copy32_loop body increases significantly with ZGC GC barriers.
 948     // Need conditional far branches to reach a point beyond the loop in this case.
 949     bool is_far = UseZGC;
 950 
 951     __ beqz(count, done, is_far);
 952     __ slli(cnt, count, exact_log2(granularity));
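    // from here on cnt is the copy size in bytes, not in elements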
 953     if (is_backwards) {
 954       __ add(src, s, cnt);
 955       __ add(dst, d, cnt);
 956     } else {
 957       __ mv(src, s);
 958       __ mv(dst, d);
 959     }
 960 
 961     if (is_aligned) {
 962       __ addi(t0, cnt, -32);
 963       __ bgez(t0, copy32_loop);
 964       __ addi(t0, cnt, -8);
 965       __ bgez(t0, copy8_loop, is_far);
 966       __ j(copy_small);
 967     } else {
 968       __ mv(t0, 16);
 969       __ blt(cnt, t0, copy_small, is_far);
 970 
 971       __ xorr(t0, src, dst);
 972       __ andi(t0, t0, 0b111);
 973       __ bnez(t0, copy_small, is_far);
 974 
 975       __ bind(same_aligned);
 976       __ andi(t0, src, 0b111);
 977       __ beqz(t0, copy_big);
 978       if (is_backwards) {
 979         __ addi(src, src, step);
 980         __ addi(dst, dst, step);
 981       }
 982       bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
 983       bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
 984       if (!is_backwards) {
 985         __ addi(src, src, step);
 986         __ addi(dst, dst, step);
 987       }
 988       __ addi(cnt, cnt, -granularity);
 989       __ beqz(cnt, done, is_far);
 990       __ j(same_aligned);
 991 
 992       __ bind(copy_big);
 993       __ mv(t0, 32);
 994       __ blt(cnt, t0, copy8_loop, is_far);
 995     }
 996 
 997     __ bind(copy32_loop);
 998     if (is_backwards) {
 999       __ addi(src, src, -wordSize * 4);
1000       __ addi(dst, dst, -wordSize * 4);
1001     }
    // we first load 32 bytes, then store them, so the copy direction within this block doesn't matter
1003     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
1004     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
1005     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
1006     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
1007 
1008     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
1009     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
1010     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
1011     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
1012 
1013     if (!is_backwards) {
1014       __ addi(src, src, wordSize * 4);
1015       __ addi(dst, dst, wordSize * 4);
1016     }
1017     __ addi(t0, cnt, -(32 + wordSize * 4));
1018     __ addi(cnt, cnt, -wordSize * 4);
1019     __ bgez(t0, copy32_loop); // cnt >= 32, do next loop
1020 
1021     __ beqz(cnt, done); // if that's all - done
1022 
    __ addi(t0, cnt, -8); // if not - copy the remainder
1024     __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
1025 
1026     __ bind(copy8_loop);
1027     if (is_backwards) {
1028       __ addi(src, src, -wordSize);
1029       __ addi(dst, dst, -wordSize);
1030     }
1031     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1032     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1033 
1034     if (!is_backwards) {
1035       __ addi(src, src, wordSize);
1036       __ addi(dst, dst, wordSize);
1037     }
1038     __ addi(t0, cnt, -(8 + wordSize));
1039     __ addi(cnt, cnt, -wordSize);
1040     __ bgez(t0, copy8_loop); // cnt >= 8, do next loop
1041 
1042     __ beqz(cnt, done); // if that's all - done
1043 
1044     __ bind(copy_small);
1045     if (is_backwards) {
1046       __ addi(src, src, step);
1047       __ addi(dst, dst, step);
1048     }
1049 
1050     bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
1051     bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
1052 
1053     if (!is_backwards) {
1054       __ addi(src, src, step);
1055       __ addi(dst, dst, step);
1056     }
1057     __ addi(cnt, cnt, -granularity);
1058     __ bgtz(cnt, copy_small);
1059 
1060     __ bind(done);
1061   }
1062 
1063   // Scan over array at a for count oops, verifying each one.
  // Preserves a and count; clobbers t0, t1 and temp.
1065   void verify_oop_array(size_t size, Register a, Register count, Register temp) {
1066     Label loop, end;
1067     __ mv(t1, zr);
1068     __ slli(t0, count, exact_log2(size));
1069     __ bind(loop);
1070     __ bgeu(t1, t0, end);
1071 
1072     __ add(temp, a, t1);
1073     if (size == (size_t)wordSize) {
1074       __ ld(temp, Address(temp, 0));
1075       __ verify_oop(temp);
1076     } else {
1077       __ lwu(temp, Address(temp, 0));
1078       __ decode_heap_oop(temp); // calls verify_oop
1079     }
1080     __ add(t1, t1, size);
1081     __ j(loop);
1082     __ bind(end);
1083   }
1084 
1085   // Arguments:
1086   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1087   //             ignored
1088   //   is_oop  - true => oop array, so generate store check code
1089   //   name    - stub name string
1090   //
1091   // Inputs:
1092   //   c_rarg0   - source array address
1093   //   c_rarg1   - destination array address
1094   //   c_rarg2   - element count, treated as ssize_t, can be zero
1095   //
1096   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1097   // the hardware handle it.  The two dwords within qwords that span
1098   // cache line boundaries will still be loaded and stored atomically.
1099   //
1100   // Side Effects:
1101   //   disjoint_int_copy_entry is set to the no-overlap entry point
1102   //   used by generate_conjoint_int_oop_copy().
1103   //
1104   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry,
1105                                  const char* name, bool dest_uninitialized = false) {
1106     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1107     RegSet saved_reg = RegSet::of(s, d, count);
1108     __ align(CodeEntryAlignment);
1109     StubCodeMark mark(this, "StubRoutines", name);
1110     address start = __ pc();
1111     __ enter();
1112 
1113     if (entry != nullptr) {
1114       *entry = __ pc();
1115       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1116       BLOCK_COMMENT("Entry:");
1117     }
1118 
1119     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1120     if (dest_uninitialized) {
1121       decorators |= IS_DEST_UNINITIALIZED;
1122     }
1123     if (aligned) {
1124       decorators |= ARRAYCOPY_ALIGNED;
1125     }
1126 
1127     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1128     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1129 
1130     if (is_oop) {
1131       // save regs before copy_memory
1132       __ push_reg(RegSet::of(d, count), sp);
1133     }
1134 
1135     {
1136       // UnsafeMemoryAccess page error: continue after unsafe access
1137       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
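      // i.e. only primitive copies get a recovery entry: the unaligned
      // variants and the 8-byte (jlong-sized) element variant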
1138       UnsafeMemoryAccessMark umam(this, add_entry, true);
1139       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1140     }
1141 
1142     if (is_oop) {
1143       __ pop_reg(RegSet::of(d, count), sp);
1144       if (VerifyOops) {
1145         verify_oop_array(size, d, count, t2);
1146       }
1147     }
1148 
1149     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1150 
1151     __ leave();
1152     __ mv(x10, zr); // return 0
1153     __ ret();
1154     return start;
1155   }
1156 
1157   // Arguments:
1158   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1159   //             ignored
1160   //   is_oop  - true => oop array, so generate store check code
1161   //   name    - stub name string
1162   //
1163   // Inputs:
1164   //   c_rarg0   - source array address
1165   //   c_rarg1   - destination array address
1166   //   c_rarg2   - element count, treated as ssize_t, can be zero
1167   //
1168   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1169   // the hardware handle it.  The two dwords within qwords that span
1170   // cache line boundaries will still be loaded and stored atomically.
1171   //
1172   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1173                                  address* entry, const char* name,
1174                                  bool dest_uninitialized = false) {
1175     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1176     RegSet saved_regs = RegSet::of(s, d, count);
1177     StubCodeMark mark(this, "StubRoutines", name);
1178     address start = __ pc();
1179     __ enter();
1180 
1181     if (entry != nullptr) {
1182       *entry = __ pc();
1183       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1184       BLOCK_COMMENT("Entry:");
1185     }
1186 
1187     // use fwd copy when (d-s) above_equal (count*size)
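    // the unsigned compare also routes the d < s case to the forward copy,
    // since d - s then wraps around to a value >= count * size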
1188     __ sub(t0, d, s);
1189     __ slli(t1, count, exact_log2(size));
1190     Label L_continue;
1191     __ bltu(t0, t1, L_continue);
1192     __ j(nooverlap_target);
1193     __ bind(L_continue);
1194 
1195     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1196     if (dest_uninitialized) {
1197       decorators |= IS_DEST_UNINITIALIZED;
1198     }
1199     if (aligned) {
1200       decorators |= ARRAYCOPY_ALIGNED;
1201     }
1202 
1203     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1204     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1205 
1206     if (is_oop) {
1207       // save regs before copy_memory
1208       __ push_reg(RegSet::of(d, count), sp);
1209     }
1210 
1211     {
1212       // UnsafeMemoryAccess page error: continue after unsafe access
1213       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1214       UnsafeMemoryAccessMark umam(this, add_entry, true);
1215       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1216     }
1217 
1218     if (is_oop) {
1219       __ pop_reg(RegSet::of(d, count), sp);
1220       if (VerifyOops) {
1221         verify_oop_array(size, d, count, t2);
1222       }
1223     }
1224     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1225     __ leave();
1226     __ mv(x10, zr); // return 0
1227     __ ret();
1228     return start;
1229   }
1230 
1231   // Arguments:
1232   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1233   //             ignored
1234   //   name    - stub name string
1235   //
1236   // Inputs:
1237   //   c_rarg0   - source array address
1238   //   c_rarg1   - destination array address
1239   //   c_rarg2   - element count, treated as ssize_t, can be zero
1240   //
1241   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1242   // we let the hardware handle it.  The one to eight bytes within words,
1243   // dwords or qwords that span cache line boundaries will still be loaded
1244   // and stored atomically.
1245   //
1246   // Side Effects:
1254   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1255   //   used by generate_conjoint_byte_copy().
1256   //
1257   address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) {
1258     const bool not_oop = false;
1259     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1260   }
1261 
1262   // Arguments:
1263   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1264   //             ignored
1265   //   name    - stub name string
1266   //
1267   // Inputs:
1268   //   c_rarg0   - source array address
1269   //   c_rarg1   - destination array address
1270   //   c_rarg2   - element count, treated as ssize_t, can be zero
1271   //
1272   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1273   // we let the hardware handle it.  The one to eight bytes within words,
1274   // dwords or qwords that span cache line boundaries will still be loaded
1275   // and stored atomically.
1276   //
1277   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1278                                       address* entry, const char* name) {
1279     const bool not_oop = false;
1280     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1281   }
1282 
1283   // Arguments:
1284   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1285   //             ignored
1286   //   name    - stub name string
1287   //
1288   // Inputs:
1289   //   c_rarg0   - source array address
1290   //   c_rarg1   - destination array address
1291   //   c_rarg2   - element count, treated as ssize_t, can be zero
1292   //
1293   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1294   // let the hardware handle it.  The two or four words within dwords
1295   // or qwords that span cache line boundaries will still be loaded
1296   // and stored atomically.
1297   //
1298   // Side Effects:
1299   //   disjoint_short_copy_entry is set to the no-overlap entry point
1300   //   used by generate_conjoint_short_copy().
1301   //
1302   address generate_disjoint_short_copy(bool aligned,
1303                                        address* entry, const char* name) {
1304     const bool not_oop = false;
1305     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1306   }
1307 
1308   // Arguments:
1309   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1310   //             ignored
1311   //   name    - stub name string
1312   //
1313   // Inputs:
1314   //   c_rarg0   - source array address
1315   //   c_rarg1   - destination array address
1316   //   c_rarg2   - element count, treated as ssize_t, can be zero
1317   //
1318   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1319   // let the hardware handle it.  The two or four words within dwords
1320   // or qwords that span cache line boundaries will still be loaded
1321   // and stored atomically.
1322   //
1323   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1324                                        address* entry, const char* name) {
1325     const bool not_oop = false;
1326     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1327   }
1328 
1329   // Arguments:
1330   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1331   //             ignored
1332   //   name    - stub name string
1333   //
1334   // Inputs:
1335   //   c_rarg0   - source array address
1336   //   c_rarg1   - destination array address
1337   //   c_rarg2   - element count, treated as ssize_t, can be zero
1338   //
1339   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1340   // the hardware handle it.  The two dwords within qwords that span
1341   // cache line boundaries will still be loaded and stored atomically.
1342   //
1343   // Side Effects:
1344   //   disjoint_int_copy_entry is set to the no-overlap entry point
1345   //   used by generate_conjoint_int_oop_copy().
1346   //
1347   address generate_disjoint_int_copy(bool aligned, address* entry,
1348                                      const char* name, bool dest_uninitialized = false) {
1349     const bool not_oop = false;
1350     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1351   }
1352 
1353   // Arguments:
1354   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1355   //             ignored
1356   //   name    - stub name string
1357   //
1358   // Inputs:
1359   //   c_rarg0   - source array address
1360   //   c_rarg1   - destination array address
1361   //   c_rarg2   - element count, treated as ssize_t, can be zero
1362   //
1363   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1364   // the hardware handle it.  The two dwords within qwords that span
1365   // cache line boundaries will still be loaded and stored atomically.
1366   //
1367   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1368                                      address* entry, const char* name,
1369                                      bool dest_uninitialized = false) {
1370     const bool not_oop = false;
1371     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1372   }
1373 
1374 
1375   // Arguments:
1376   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1377   //             ignored
1378   //   name    - stub name string
1379   //
1380   // Inputs:
1381   //   c_rarg0   - source array address
1382   //   c_rarg1   - destination array address
1383   //   c_rarg2   - element count, treated as size_t, can be zero
1384   //
1385   // Side Effects:
1386   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1387   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1388   //
1389   address generate_disjoint_long_copy(bool aligned, address* entry,
1390                                       const char* name, bool dest_uninitialized = false) {
1391     const bool not_oop = false;
1392     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1393   }
1394 
1395   // Arguments:
1396   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1397   //             ignored
1398   //   name    - stub name string
1399   //
1400   // Inputs:
1401   //   c_rarg0   - source array address
1402   //   c_rarg1   - destination array address
1403   //   c_rarg2   - element count, treated as size_t, can be zero
1404   //
1405   address generate_conjoint_long_copy(bool aligned,
1406                                       address nooverlap_target, address* entry,
1407                                       const char* name, bool dest_uninitialized = false) {
1408     const bool not_oop = false;
1409     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1410   }
1411 
1412   // Arguments:
1413   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1414   //             ignored
1415   //   name    - stub name string
1416   //
1417   // Inputs:
1418   //   c_rarg0   - source array address
1419   //   c_rarg1   - destination array address
1420   //   c_rarg2   - element count, treated as size_t, can be zero
1421   //
1422   // Side Effects:
1423   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1424   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1425   //
1426   address generate_disjoint_oop_copy(bool aligned, address* entry,
1427                                      const char* name, bool dest_uninitialized) {
1428     const bool is_oop = true;
1429     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1430     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1431   }
1432 
1433   // Arguments:
1434   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1435   //             ignored
1436   //   name    - stub name string
1437   //
1438   // Inputs:
1439   //   c_rarg0   - source array address
1440   //   c_rarg1   - destination array address
1441   //   c_rarg2   - element count, treated as size_t, can be zero
1442   //
1443   address generate_conjoint_oop_copy(bool aligned,
1444                                      address nooverlap_target, address* entry,
1445                                      const char* name, bool dest_uninitialized) {
1446     const bool is_oop = true;
1447     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1448     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1449                                   name, dest_uninitialized);
1450   }
1451 
1452   // Helper for generating a dynamic type check.
1453   // Smashes t0, t1.
1454   void generate_type_check(Register sub_klass,
1455                            Register super_check_offset,
1456                            Register super_klass,
1457                            Label& L_success) {
1458     assert_different_registers(sub_klass, super_check_offset, super_klass);
1459 
1460     BLOCK_COMMENT("type_check:");
1461 
1462     Label L_miss;
1463 
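    // the fast path covers an exact match and the cached super check slot;
    // everything else falls to the slow path, which scans the secondary
    // supers array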
1464     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1465     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1466 
1467     // Fall through on failure!
1468     __ BIND(L_miss);
1469   }
1470 
1471   //
1472   //  Generate checkcasting array copy stub
1473   //
1474   //  Input:
1475   //    c_rarg0   - source array address
1476   //    c_rarg1   - destination array address
1477   //    c_rarg2   - element count, treated as ssize_t, can be zero
1478   //    c_rarg3   - size_t ckoff (super_check_offset)
1479   //    c_rarg4   - oop ckval (super_klass)
1480   //
1481   //  Output:
1482   //    x10 ==  0  -  success
1483   //    x10 == -1^K - failure, where K is partial transfer count
1484   //
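       //  The failure value -1^K is the bitwise complement ~K (see the
       //  "xori(count, count, -1)" below); success returns 0, and the caller
       //  can recover the partial transfer count K as ~x10.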
1485   address generate_checkcast_copy(const char* name, address* entry,
1486                                   bool dest_uninitialized = false) {
1487     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1488 
1489     // Input registers (after setup_arg_regs)
1490     const Register from        = c_rarg0;   // source array address
1491     const Register to          = c_rarg1;   // destination array address
1492     const Register count       = c_rarg2;   // elements count
1493     const Register ckoff       = c_rarg3;   // super_check_offset
1494     const Register ckval       = c_rarg4;   // super_klass
1495 
1496     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1497     RegSet wb_post_saved_regs  = RegSet::of(count);
1498 
1499     // Registers used as temps (x7, x9, x18 are save-on-entry)
1500     const Register count_save  = x19;       // orig elements count
1501     const Register start_to    = x18;       // destination array start address
1502     const Register copied_oop  = x7;        // actual oop copied
1503     const Register r9_klass    = x9;        // oop._klass
1504 
1505     // Registers used as gc temps (x15, x16, x17 are save-on-call)
1506     const Register gct1 = x15, gct2 = x16, gct3 = x17;
1507 
1508     //---------------------------------------------------------------
1509     // Assembler stub will be used for this call to arraycopy
1510     // if the two arrays are subtypes of Object[] but the
1511     // destination array type is not equal to or a supertype
1512     // of the source type.  Each element must be separately
1513     // checked.
1514 
1515     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1516                                copied_oop, r9_klass, count_save);
1517 
1518     __ align(CodeEntryAlignment);
1519     StubCodeMark mark(this, "StubRoutines", name);
1520     address start = __ pc();
1521 
1522     __ enter(); // required for proper stackwalking of RuntimeStub frame
1523 
1524     // Caller of this entry point must set up the argument registers.
1525     if (entry != nullptr) {
1526       *entry = __ pc();
1527       BLOCK_COMMENT("Entry:");
1528     }
1529 
1530     // Empty array:  Nothing to do
1531     __ beqz(count, L_done);
1532 
1533     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1534 
1535 #ifdef ASSERT
1536     BLOCK_COMMENT("assert consistent ckoff/ckval");
1537     // The ckoff and ckval must be mutually consistent,
1538     // even though caller generates both.
1539     { Label L;
1540       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1541       __ lwu(start_to, Address(ckval, sco_offset));
1542       __ beq(ckoff, start_to, L);
1543       __ stop("super_check_offset inconsistent");
1544       __ bind(L);
1545     }
1546 #endif //ASSERT
1547 
1548     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1549     if (dest_uninitialized) {
1550       decorators |= IS_DEST_UNINITIALIZED;
1551     }
1552 
1553     bool is_oop = true;
1554     int element_size = UseCompressedOops ? 4 : 8;
1555 
1556     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1557     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1558 
1559     // save the original count
1560     __ mv(count_save, count);
1561 
1562     // Copy from low to high addresses
1563     __ mv(start_to, to);              // Save destination array start address
1564     __ j(L_load_element);
1565 
1566     // ======== begin loop ========
1567     // (Loop is rotated; its entry is L_load_element.)
1568     // Loop control:
1569     //   for count down to 0 do
1570     //     copied_oop = load_heap_oop(from++)
1571     //     ... generate_type_check ...
1572     //     store_heap_oop(to++, copied_oop)
1573     //   end
1574 
1575     __ align(OptoLoopAlignment);
1576 
1577     __ BIND(L_store_element);
1578     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1579                       Address(to, 0), copied_oop,
1580                       gct1, gct2, gct3);
1581     __ add(to, to, UseCompressedOops ? 4 : 8);
1582     __ sub(count, count, 1);
1583     __ beqz(count, L_do_card_marks);
1584 
1585     // ======== loop entry is here ========
1586     __ BIND(L_load_element);
1587     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1588                      copied_oop, Address(from, 0),
1589                      gct1);
1590     __ add(from, from, UseCompressedOops ? 4 : 8);
1591     __ beqz(copied_oop, L_store_element);
1592 
1593     __ load_klass(r9_klass, copied_oop); // query the object klass
1594     generate_type_check(r9_klass, ckoff, ckval, L_store_element);
1595     // ======== end loop ========
1596 
1597     // It was a real error; we must depend on the caller to finish the job.
1598     // Register count = remaining oops, count_save = total oops.
1599     // Emit GC store barriers for the oops we have copied and report
1600     // their number to the caller.
1601 
1602     __ sub(count, count_save, count);     // K = partially copied oop count
1603     __ xori(count, count, -1);                   // report (-1^K) to caller
1604     __ beqz(count, L_done_pop);
1605 
1606     __ BIND(L_do_card_marks);
1607     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);
1608 
1609     __ bind(L_done_pop);
1610     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1611     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1612 
1613     __ bind(L_done);
1614     __ mv(x10, count);
1615     __ leave();
1616     __ ret();
1617 
1618     return start;
1619   }
1620 
1621   // Perform range checks on the proposed arraycopy.
1622   // Kills temp, but nothing else.
1623   // Also, clean the sign bits of src_pos and dst_pos.
1624   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1625                               Register src_pos, // source position (c_rarg1)
1626                               Register dst,     // destination array oop (c_rarg2)
1627                               Register dst_pos, // destination position (c_rarg3)
1628                               Register length,
1629                               Register temp,
1630                               Label& L_failed) {
1631     BLOCK_COMMENT("arraycopy_range_checks:");
1632 
1633     assert_different_registers(t0, temp);
1634 
1635     // if [src_pos + length > arrayOop(src)->length()] then FAIL
1636     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1637     __ addw(temp, length, src_pos);
1638     __ bgtu(temp, t0, L_failed);
1639 
1640     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1641     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1642     __ addw(temp, length, dst_pos);
1643     __ bgtu(temp, t0, L_failed);
1644 
1645     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1646     __ zero_extend(src_pos, src_pos, 32);
1647     __ zero_extend(dst_pos, dst_pos, 32);
1648 
1649     BLOCK_COMMENT("arraycopy_range_checks done");
1650   }
1651 
1652   //
1653   //  Generate 'unsafe' array copy stub
1654   //  Though just as safe as the other stubs, it takes an unscaled
1655   //  size_t argument instead of an element count.
1656   //
1657   //  Input:
1658   //    c_rarg0   - source array address
1659   //    c_rarg1   - destination array address
1660   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1661   //
1662   // Examines the alignment of the operands and dispatches
1663   // to a long, int, short, or byte copy loop.
1664   //
1665   address generate_unsafe_copy(const char* name,
1666                                address byte_copy_entry,
1667                                address short_copy_entry,
1668                                address int_copy_entry,
1669                                address long_copy_entry) {
1670     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1671                 int_copy_entry != nullptr && long_copy_entry != nullptr);
1672     Label L_long_aligned, L_int_aligned, L_short_aligned;
1673     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1674 
1675     __ align(CodeEntryAlignment);
1676     StubCodeMark mark(this, "StubRoutines", name);
1677     address start = __ pc();
1678     __ enter(); // required for proper stackwalking of RuntimeStub frame
1679 
1680     // bump this on entry, not on exit:
1681     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1682 
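         // Dispatch on the common alignment of s, d and count:
         //   ((s | d | count) & 7) == 0  -> copy as 8-byte longs
         //   ((s | d | count) & 3) == 0  -> copy as 4-byte ints
         //   ((s | d | count) & 1) == 0  -> copy as 2-byte shorts
         //   otherwise                   -> copy as single bytes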
1683     __ orr(t0, s, d);
1684     __ orr(t0, t0, count);
1685 
1686     __ andi(t0, t0, BytesPerLong - 1);
1687     __ beqz(t0, L_long_aligned);
1688     __ andi(t0, t0, BytesPerInt - 1);
1689     __ beqz(t0, L_int_aligned);
1690     __ test_bit(t0, t0, 0);
1691     __ beqz(t0, L_short_aligned);
1692     __ j(RuntimeAddress(byte_copy_entry));
1693 
1694     __ BIND(L_short_aligned);
1695     __ srli(count, count, LogBytesPerShort);  // size => short_count
1696     __ j(RuntimeAddress(short_copy_entry));
1697     __ BIND(L_int_aligned);
1698     __ srli(count, count, LogBytesPerInt);    // size => int_count
1699     __ j(RuntimeAddress(int_copy_entry));
1700     __ BIND(L_long_aligned);
1701     __ srli(count, count, LogBytesPerLong);   // size => long_count
1702     __ j(RuntimeAddress(long_copy_entry));
1703 
1704     return start;
1705   }
1706 
1707   //
1708   //  Generate generic array copy stubs
1709   //
1710   //  Input:
1711   //    c_rarg0    -  src oop
1712   //    c_rarg1    -  src_pos (32-bits)
1713   //    c_rarg2    -  dst oop
1714   //    c_rarg3    -  dst_pos (32-bits)
1715   //    c_rarg4    -  element count (32-bits)
1716   //
1717   //  Output:
1718   //    x10 ==  0  -  success
1719   //    x10 == -1^K - failure, where K is partial transfer count
1720   //
1721   address generate_generic_copy(const char* name,
1722                                 address byte_copy_entry, address short_copy_entry,
1723                                 address int_copy_entry, address oop_copy_entry,
1724                                 address long_copy_entry, address checkcast_copy_entry) {
1725     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1726                 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1727                 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1728     Label L_failed, L_failed_0, L_objArray;
1729     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1730 
1731     // Input registers
1732     const Register src        = c_rarg0;  // source array oop
1733     const Register src_pos    = c_rarg1;  // source position
1734     const Register dst        = c_rarg2;  // destination array oop
1735     const Register dst_pos    = c_rarg3;  // destination position
1736     const Register length     = c_rarg4;
1737 
1738     // Registers used as temps
1739     const Register dst_klass = c_rarg5;
1740 
1741     __ align(CodeEntryAlignment);
1742 
1743     StubCodeMark mark(this, "StubRoutines", name);
1744 
1745     address start = __ pc();
1746 
1747     __ enter(); // required for proper stackwalking of RuntimeStub frame
1748 
1749     // bump this on entry, not on exit:
1750     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1751 
1752     //-----------------------------------------------------------------------
1753     // Assembler stub will be used for this call to arraycopy
1754     // if the following conditions are met:
1755     //
1756     // (1) src and dst must not be null.
1757     // (2) src_pos must not be negative.
1758     // (3) dst_pos must not be negative.
1759     // (4) length  must not be negative.
1760     // (5) src klass and dst klass should be the same and not null.
1761     // (6) src and dst should be arrays.
1762     // (7) src_pos + length must not exceed length of src.
1763     // (8) dst_pos + length must not exceed length of dst.
1764     //
1765 
1766     // if src is null then return -1
1767     __ beqz(src, L_failed);
1768 
1769     // if [src_pos < 0] then return -1
1770     __ sign_extend(t0, src_pos, 32);
1771     __ bltz(t0, L_failed);
1772 
1773     // if dst is null then return -1
1774     __ beqz(dst, L_failed);
1775 
1776     // if [dst_pos < 0] then return -1
1777     __ sign_extend(t0, dst_pos, 32);
1778     __ bltz(t0, L_failed);
1779 
1780     // registers used as temp
1781     const Register scratch_length    = x28; // elements count to copy
1782     const Register scratch_src_klass = x29; // array klass
1783     const Register lh                = x30; // layout helper
1784 
1785     // if [length < 0] then return -1
1786     __ sign_extend(scratch_length, length, 32);    // length (elements count, 32-bits value)
1787     __ bltz(scratch_length, L_failed);
1788 
1789     __ load_klass(scratch_src_klass, src);
1790 #ifdef ASSERT
1791     {
1792       BLOCK_COMMENT("assert klasses not null {");
1793       Label L1, L2;
1794       __ bnez(scratch_src_klass, L2);   // it is broken if klass is null
1795       __ bind(L1);
1796       __ stop("broken null klass");
1797       __ bind(L2);
1798       __ load_klass(t0, dst, t1);
1799       __ beqz(t0, L1);     // this would be broken also
1800       BLOCK_COMMENT("} assert klasses not null done");
1801     }
1802 #endif
1803 
1804     // Load layout helper (32-bits)
1805     //
1806     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1807     // 32        30    24            16              8     2                 0
1808     //
1809     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1810     //
1811 
1812     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1813 
1814     // Handle objArrays completely differently...
1815     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1816     __ lw(lh, Address(scratch_src_klass, lh_offset));
1817     __ mv(t0, objArray_lh);
1818     __ beq(lh, t0, L_objArray);
1819 
1820     // if [src->klass() != dst->klass()] then return -1
1821     __ load_klass(t1, dst);
1822     __ bne(t1, scratch_src_klass, L_failed);
1823 
1824     // if !src->is_Array() then return -1
1825     // i.e. (lh >= 0): array layout helpers are always negative
1826     __ bgez(lh, L_failed);
1827 
1828     // At this point, it is known to be a typeArray (array_tag 0x3).
1829 #ifdef ASSERT
1830     {
1831       BLOCK_COMMENT("assert primitive array {");
1832       Label L;
1833       __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1834       __ bge(lh, t1, L);
1835       __ stop("must be a primitive array");
1836       __ bind(L);
1837       BLOCK_COMMENT("} assert primitive array done");
1838     }
1839 #endif
1840 
1841     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1842                            t1, L_failed);
1843 
1844     // TypeArrayKlass
1845     //
1846     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1847     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1848     //
1849 
1850     const Register t0_offset = t0;    // array offset
1851     const Register x30_elsize = lh;   // element size
1852 
1853     // Get array_header_in_bytes()
1854     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1855     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1856     __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // shift left to drop bits 24..31 above the header_size field
1857     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
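         // Together the two shifts compute
         // (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask,
         // i.e. the array header size in bytes (see the layout helper diagram above).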
1858 
1859     __ add(src, src, t0_offset);           // src array offset
1860     __ add(dst, dst, t0_offset);           // dst array offset
1861     BLOCK_COMMENT("choose copy loop based on element size");
1862 
1863     // next registers should be set before the jump to corresponding stub
1864     const Register from     = c_rarg0;  // source array address
1865     const Register to       = c_rarg1;  // destination array address
1866     const Register count    = c_rarg2;  // elements count
1867 
1868     // 'from', 'to' and 'count' must be set up in this order,
1869     // since they alias 'src', 'src_pos' and 'dst' respectively.
1870 
1871     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1872 
1873     // The possible values of elsize are 0-3, i.e. exact_log2(element
1874     // size in bytes).  We do a simple bitwise binary search.
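         // Dispatch:
         //   elsize bit 1: 0 -> byte/short copy, 1 -> int/long copy
         //   elsize bit 0: 0 -> byte or int,     1 -> short or long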
1875   __ BIND(L_copy_bytes);
1876     __ test_bit(t0, x30_elsize, 1);
1877     __ bnez(t0, L_copy_ints);
1878     __ test_bit(t0, x30_elsize, 0);
1879     __ bnez(t0, L_copy_shorts);
1880     __ add(from, src, src_pos); // src_addr
1881     __ add(to, dst, dst_pos); // dst_addr
1882     __ sign_extend(count, scratch_length, 32); // length
1883     __ j(RuntimeAddress(byte_copy_entry));
1884 
1885   __ BIND(L_copy_shorts);
1886     __ shadd(from, src_pos, src, t0, 1); // src_addr
1887     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1888     __ sign_extend(count, scratch_length, 32); // length
1889     __ j(RuntimeAddress(short_copy_entry));
1890 
1891   __ BIND(L_copy_ints);
1892     __ test_bit(t0, x30_elsize, 0);
1893     __ bnez(t0, L_copy_longs);
1894     __ shadd(from, src_pos, src, t0, 2); // src_addr
1895     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
1896     __ sign_extend(count, scratch_length, 32); // length
1897     __ j(RuntimeAddress(int_copy_entry));
1898 
1899   __ BIND(L_copy_longs);
1900 #ifdef ASSERT
1901     {
1902       BLOCK_COMMENT("assert long copy {");
1903       Label L;
1904       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
1905       __ sign_extend(lh, lh, 32);
1906       __ mv(t0, LogBytesPerLong);
1907       __ beq(x30_elsize, t0, L);
1908       __ stop("must be long copy, but elsize is wrong");
1909       __ bind(L);
1910       BLOCK_COMMENT("} assert long copy done");
1911     }
1912 #endif
1913     __ shadd(from, src_pos, src, t0, 3); // src_addr
1914     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
1915     __ sign_extend(count, scratch_length, 32); // length
1916     __ j(RuntimeAddress(long_copy_entry));
1917 
1918     // ObjArrayKlass
1919   __ BIND(L_objArray);
1920     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1921 
1922     Label L_plain_copy, L_checkcast_copy;
1923     // test array classes for subtyping
1924     __ load_klass(t2, dst);
1925     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
1926 
1927     // Identically typed arrays can be copied without element-wise checks.
1928     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1929                            t1, L_failed);
1930 
1931     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1932     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1933     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1934     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1935     __ sign_extend(count, scratch_length, 32); // length
1936   __ BIND(L_plain_copy);
1937     __ j(RuntimeAddress(oop_copy_entry));
1938 
1939   __ BIND(L_checkcast_copy);
1940     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
1941     {
1942       // Before looking at dst.length, make sure dst is also an objArray.
1943       __ lwu(t0, Address(t2, lh_offset));
1944       __ mv(t1, objArray_lh);
1945       __ bne(t0, t1, L_failed);
1946 
1947       // It is safe to examine both src.length and dst.length.
1948       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1949                              t2, L_failed);
1950 
1951       __ load_klass(dst_klass, dst); // reload
1952 
1953       // Marshal the base address arguments now, freeing registers.
1954       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1955       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1956       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1957       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1958       __ sign_extend(count, length, 32);      // length (reloaded)
1959       const Register sco_temp = c_rarg3;      // this register is free now
1960       assert_different_registers(from, to, count, sco_temp,
1961                                  dst_klass, scratch_src_klass);
1962 
1963       // Generate the type check.
1964       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1965       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1966 
1967       // Smashes t0, t1
1968       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
1969 
1970       // Fetch destination element klass from the ObjArrayKlass header.
1971       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1972       __ ld(dst_klass, Address(dst_klass, ek_offset));
1973       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1974 
1975       // the checkcast_copy loop needs two extra arguments:
1976       assert(c_rarg3 == sco_temp, "#3 already in place");
1977       // Set up arguments for checkcast_copy_entry.
1978       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
1979       __ j(RuntimeAddress(checkcast_copy_entry));
1980     }
1981 
1982   __ BIND(L_failed);
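         // Return -1 (i.e. -1^0): no elements were copied.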
1983     __ mv(x10, -1);
1984     __ leave();   // required for proper stackwalking of RuntimeStub frame
1985     __ ret();
1986 
1987     return start;
1988   }
1989 
1990   //
1991   // Generate stub for array fill. If "aligned" is true, the
1992   // "to" address is assumed to be heapword aligned.
1993   //
1994   // Arguments for generated stub:
1995   //   to:    c_rarg0
1996   //   value: c_rarg1
1997   //   count: c_rarg2 treated as signed
1998   //
1999   address generate_fill(BasicType t, bool aligned, const char* name) {
2000     __ align(CodeEntryAlignment);
2001     StubCodeMark mark(this, "StubRoutines", name);
2002     address start = __ pc();
2003 
2004     BLOCK_COMMENT("Entry:");
2005 
2006     const Register to        = c_rarg0;  // destination array address
2007     const Register value     = c_rarg1;  // value
2008     const Register count     = c_rarg2;  // elements count
2009 
2010     const Register bz_base   = x28;      // base for block_zero routine
2011     const Register cnt_words = x29;      // temp register
2012     const Register tmp_reg   = t1;
2013 
2014     __ enter();
2015 
2016     Label L_fill_elements, L_exit1;
2017 
2018     int shift = -1;
2019     switch (t) {
2020       case T_BYTE:
2021         shift = 0;
2022 
2023         // Zero extend value
2024         // 8 bit -> 16 bit
2025         __ andi(value, value, 0xff);
2026         __ mv(tmp_reg, value);
2027         __ slli(tmp_reg, tmp_reg, 8);
2028         __ orr(value, value, tmp_reg);
2029 
2030         // 16 bit -> 32 bit
2031         __ mv(tmp_reg, value);
2032         __ slli(tmp_reg, tmp_reg, 16);
2033         __ orr(value, value, tmp_reg);
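             // value now holds the fill byte replicated through the low 32 bits,
             // e.g. 0x000000AB -> 0xABABABAB; it is widened to 64 bits further
             // below, before fill_words.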
2034 
2035         __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2036         __ bltu(count, tmp_reg, L_fill_elements);
2037         break;
2038       case T_SHORT:
2039         shift = 1;
2040         // Zero extend value
2041         // 16 bit -> 32 bit
2042         __ andi(value, value, 0xffff);
2043         __ mv(tmp_reg, value);
2044         __ slli(tmp_reg, tmp_reg, 16);
2045         __ orr(value, value, tmp_reg);
2046 
2047         // Short arrays (< 8 bytes) fill by element
2048         __ mv(tmp_reg, 8 >> shift);
2049         __ bltu(count, tmp_reg, L_fill_elements);
2050         break;
2051       case T_INT:
2052         shift = 2;
2053 
2054         // Short arrays (< 8 bytes) fill by element
2055         __ mv(tmp_reg, 8 >> shift);
2056         __ bltu(count, tmp_reg, L_fill_elements);
2057         break;
2058       default: ShouldNotReachHere();
2059     }
2060 
2061     // Align the destination address at an 8-byte boundary.
2062     Label L_skip_align1, L_skip_align2, L_skip_align4;
2063     if (!aligned) {
2064       switch (t) {
2065         case T_BYTE:
2066           // One byte misalignment happens only for byte arrays.
2067           __ test_bit(t0, to, 0);
2068           __ beqz(t0, L_skip_align1);
2069           __ sb(value, Address(to, 0));
2070           __ addi(to, to, 1);
2071           __ addiw(count, count, -1);
2072           __ bind(L_skip_align1);
2073           // Fallthrough
2074         case T_SHORT:
2075           // Two bytes misalignment happens only for byte and short (char) arrays.
2076           __ test_bit(t0, to, 1);
2077           __ beqz(t0, L_skip_align2);
2078           __ sh(value, Address(to, 0));
2079           __ addi(to, to, 2);
2080           __ addiw(count, count, -(2 >> shift));
2081           __ bind(L_skip_align2);
2082           // Fallthrough
2083         case T_INT:
2084           // Align to 8 bytes, we know we are 4 byte aligned to start.
2085           __ test_bit(t0, to, 2);
2086           __ beqz(t0, L_skip_align4);
2087           __ sw(value, Address(to, 0));
2088           __ addi(to, to, 4);
2089           __ addiw(count, count, -(4 >> shift));
2090           __ bind(L_skip_align4);
2091           break;
2092         default: ShouldNotReachHere();
2093       }
2094     }
2095 
2096     //
2097     //  Fill large chunks
2098     //
2099     __ srliw(cnt_words, count, 3 - shift); // number of words
2100 
2101     // 32 bit -> 64 bit
2102     __ andi(value, value, 0xffffffff);
2103     __ mv(tmp_reg, value);
2104     __ slli(tmp_reg, tmp_reg, 32);
2105     __ orr(value, value, tmp_reg);
2106 
2107     __ slli(tmp_reg, cnt_words, 3 - shift);
2108     __ subw(count, count, tmp_reg);
2109     {
2110       __ fill_words(to, cnt_words, value);
2111     }
2112 
2113     // Remaining count is less than 8 bytes. Fill it by a single store.
2114     // Note that the total length is no less than 8 bytes.
2115     if (t == T_BYTE || t == T_SHORT) {
2116       __ beqz(count, L_exit1);
2117       __ shadd(to, count, to, tmp_reg, shift); // points to the end
2118       __ sd(value, Address(to, -8)); // overwrite some elements
2119       __ bind(L_exit1);
2120       __ leave();
2121       __ ret();
2122     }
2123 
2124     // Handle fills of less than 8 bytes.
2125     Label L_fill_2, L_fill_4, L_exit2;
2126     __ bind(L_fill_elements);
2127     switch (t) {
2128       case T_BYTE:
2129         __ test_bit(t0, count, 0);
2130         __ beqz(t0, L_fill_2);
2131         __ sb(value, Address(to, 0));
2132         __ addi(to, to, 1);
2133         __ bind(L_fill_2);
2134         __ test_bit(t0, count, 1);
2135         __ beqz(t0, L_fill_4);
2136         __ sh(value, Address(to, 0));
2137         __ addi(to, to, 2);
2138         __ bind(L_fill_4);
2139         __ test_bit(t0, count, 2);
2140         __ beqz(t0, L_exit2);
2141         __ sw(value, Address(to, 0));
2142         break;
2143       case T_SHORT:
2144         __ test_bit(t0, count, 0);
2145         __ beqz(t0, L_fill_4);
2146         __ sh(value, Address(to, 0));
2147         __ addi(to, to, 2);
2148         __ bind(L_fill_4);
2149         __ test_bit(t0, count, 1);
2150         __ beqz(t0, L_exit2);
2151         __ sw(value, Address(to, 0));
2152         break;
2153       case T_INT:
2154         __ beqz(count, L_exit2);
2155         __ sw(value, Address(to, 0));
2156         break;
2157       default: ShouldNotReachHere();
2158     }
2159     __ bind(L_exit2);
2160     __ leave();
2161     __ ret();
2162     return start;
2163   }
2164 
2165   void generate_arraycopy_stubs() {
2166     address entry                     = nullptr;
2167     address entry_jbyte_arraycopy     = nullptr;
2168     address entry_jshort_arraycopy    = nullptr;
2169     address entry_jint_arraycopy      = nullptr;
2170     address entry_oop_arraycopy       = nullptr;
2171     address entry_jlong_arraycopy     = nullptr;
2172     address entry_checkcast_arraycopy = nullptr;
2173 
2174     generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards);
2175     generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards);
2176 
2177     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2178 
2179     //*** jbyte
2180     // Always need aligned and unaligned versions
2181     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_byte_copy(false, &entry,
2182                                                                                    "jbyte_disjoint_arraycopy");
2183     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_byte_copy(false, entry,
2184                                                                                    &entry_jbyte_arraycopy,
2185                                                                                    "jbyte_arraycopy");
2186     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, &entry,
2187                                                                                    "arrayof_jbyte_disjoint_arraycopy");
2188     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_byte_copy(true, entry, nullptr,
2189                                                                                    "arrayof_jbyte_arraycopy");
2190 
2191     //*** jshort
2192     // Always need aligned and unaligned versions
2193     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2194                                                                                     "jshort_disjoint_arraycopy");
2195     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2196                                                                                     &entry_jshort_arraycopy,
2197                                                                                     "jshort_arraycopy");
2198     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2199                                                                                     "arrayof_jshort_disjoint_arraycopy");
2200     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2201                                                                                     "arrayof_jshort_arraycopy");
2202 
2203     //*** jint
2204     // Aligned versions
2205     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_copy(true, &entry,
2206                                                                                   "arrayof_jint_disjoint_arraycopy");
2207     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2208                                                                                   "arrayof_jint_arraycopy");
2209     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2210     // entry_jint_arraycopy always points to the unaligned version
2211     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_int_copy(false, &entry,
2212                                                                                   "jint_disjoint_arraycopy");
2213     StubRoutines::_jint_arraycopy                    = generate_conjoint_int_copy(false, entry,
2214                                                                                   &entry_jint_arraycopy,
2215                                                                                   "jint_arraycopy");
2216 
2217     //*** jlong
2218     // It is always aligned
2219     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, &entry,
2220                                                                                    "arrayof_jlong_disjoint_arraycopy");
2221     StubRoutines::_arrayof_jlong_arraycopy           = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2222                                                                                    "arrayof_jlong_arraycopy");
2223     StubRoutines::_jlong_disjoint_arraycopy          = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2224     StubRoutines::_jlong_arraycopy                   = StubRoutines::_arrayof_jlong_arraycopy;
2225 
2226     //*** oops
2227     {
2228       // With compressed oops we need unaligned versions; notice that
2229       // we overwrite entry_oop_arraycopy.
2230       bool aligned = !UseCompressedOops;
2231 
2232       StubRoutines::_arrayof_oop_disjoint_arraycopy
2233         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2234                                      /*dest_uninitialized*/false);
2235       StubRoutines::_arrayof_oop_arraycopy
2236         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2237                                      /*dest_uninitialized*/false);
2238       // Aligned versions without pre-barriers
2239       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2240         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2241                                      /*dest_uninitialized*/true);
2242       StubRoutines::_arrayof_oop_arraycopy_uninit
2243         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2244                                      /*dest_uninitialized*/true);
2245     }
2246 
2247     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2248     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2249     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2250     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2251 
2252     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2253     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2254                                                                         /*dest_uninitialized*/true);
2255 
2256 
2257     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2258                                                               entry_jbyte_arraycopy,
2259                                                               entry_jshort_arraycopy,
2260                                                               entry_jint_arraycopy,
2261                                                               entry_jlong_arraycopy);
2262 
2263     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2264                                                                entry_jbyte_arraycopy,
2265                                                                entry_jshort_arraycopy,
2266                                                                entry_jint_arraycopy,
2267                                                                entry_oop_arraycopy,
2268                                                                entry_jlong_arraycopy,
2269                                                                entry_checkcast_arraycopy);
2270 
2271     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2272     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2273     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2274     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2275     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2276     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2277   }
2278 
2279   void generate_aes_loadkeys(const Register &key, VectorRegister *working_vregs, int rounds) {
2280     const int step = 16;
2281     for (int i = 0; i < rounds; i++) {
2282       __ vle32_v(working_vregs[i], key);
2283       // The round keys are stored in a little-endian int array, while we
2284       // need to operate on them in big-endian, so perform an endian swap
2285       // here with the vrev8.v instruction.
2286       __ vrev8_v(working_vregs[i], working_vregs[i]);
2287       __ addi(key, key, step);
2288     }
2289   }
2290 
2291   void generate_aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2292     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2293 
2294     __ vxor_vv(res, res, working_vregs[0]);
2295     for (int i = 1; i < rounds - 1; i++) {
2296       __ vaesem_vv(res, working_vregs[i]);
2297     }
2298     __ vaesef_vv(res, working_vregs[rounds - 1]);
2299   }
2300 
2301   // Arguments:
2302   //
2303   // Inputs:
2304   //   c_rarg0   - source byte array address
2305   //   c_rarg1   - destination byte array address
2306   //   c_rarg2   - K (key) in little endian int array
2307   //
2308   address generate_aescrypt_encryptBlock() {
2309     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2310 
2311     __ align(CodeEntryAlignment);
2312     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2313 
2314     Label L_aes128, L_aes192;
2315 
2316     const Register from        = c_rarg0;  // source array address
2317     const Register to          = c_rarg1;  // destination array address
2318     const Register key         = c_rarg2;  // key array address
2319     const Register keylen      = c_rarg3;
2320 
2321     VectorRegister working_vregs[] = {
2322       v4, v5, v6, v7, v8, v9, v10, v11,
2323       v12, v13, v14, v15, v16, v17, v18
2324     };
2325     const VectorRegister res   = v19;
2326 
2327     address start = __ pc();
2328     __ enter();
2329 
2330     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2331 
2332     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2333     __ vle32_v(res, from);
2334 
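         // keylen is the expanded key length in ints: 44 for AES-128 (11 round
         // keys), 52 for AES-192 (13 round keys), 60 for AES-256 (15 round keys).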
2335     __ mv(t2, 52);
2336     __ blt(keylen, t2, L_aes128);
2337     __ beq(keylen, t2, L_aes192);
2338     // Otherwise we fall through to the largest case (256-bit key size)
2339 
2340     // Note: the following function performs key += 15*16
2341     generate_aes_loadkeys(key, working_vregs, 15);
2342     generate_aes_encrypt(res, working_vregs, 15);
2343     __ vse32_v(res, to);
2344     __ mv(c_rarg0, 0);
2345     __ leave();
2346     __ ret();
2347 
2348   __ bind(L_aes192);
2349     // Note: the following function performs key += 13*16
2350     generate_aes_loadkeys(key, working_vregs, 13);
2351     generate_aes_encrypt(res, working_vregs, 13);
2352     __ vse32_v(res, to);
2353     __ mv(c_rarg0, 0);
2354     __ leave();
2355     __ ret();
2356 
2357   __ bind(L_aes128);
2358     // Note: the following function performs key += 11*16
2359     generate_aes_loadkeys(key, working_vregs, 11);
2360     generate_aes_encrypt(res, working_vregs, 11);
2361     __ vse32_v(res, to);
2362     __ mv(c_rarg0, 0);
2363     __ leave();
2364     __ ret();
2365 
2366     return start;
2367   }
2368 
2369   void generate_aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2370     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2371 
2372     __ vxor_vv(res, res, working_vregs[rounds - 1]);
2373     for (int i = rounds - 2; i > 0; i--) {
2374       __ vaesdm_vv(res, working_vregs[i]);
2375     }
2376     __ vaesdf_vv(res, working_vregs[0]);
2377   }
2378 
2379   // Arguments:
2380   //
2381   // Inputs:
2382   //   c_rarg0   - source byte array address
2383   //   c_rarg1   - destination byte array address
2384   //   c_rarg2   - K (key) in little endian int array
2385   //
2386   address generate_aescrypt_decryptBlock() {
2387     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2388 
2389     __ align(CodeEntryAlignment);
2390     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2391 
2392     Label L_aes128, L_aes192;
2393 
2394     const Register from        = c_rarg0;  // source array address
2395     const Register to          = c_rarg1;  // destination array address
2396     const Register key         = c_rarg2;  // key array address
2397     const Register keylen      = c_rarg3;
2398 
2399     VectorRegister working_vregs[] = {
2400       v4, v5, v6, v7, v8, v9, v10, v11,
2401       v12, v13, v14, v15, v16, v17, v18
2402     };
2403     const VectorRegister res   = v19;
2404 
2405     address start = __ pc();
2406     __ enter(); // required for proper stackwalking of RuntimeStub frame
2407 
2408     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2409 
2410     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2411     __ vle32_v(res, from);
2412 
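         // keylen: 44/52/60 ints for AES-128/192/256, as in encryptBlock above.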
2413     __ mv(t2, 52);
2414     __ blt(keylen, t2, L_aes128);
2415     __ beq(keylen, t2, L_aes192);
2416     // Otherwise we fall through to the largest case (256-bit key size)
2417 
2418     // Note: the following function performs key += 15*16
2419     generate_aes_loadkeys(key, working_vregs, 15);
2420     generate_aes_decrypt(res, working_vregs, 15);
2421     __ vse32_v(res, to);
2422     __ mv(c_rarg0, 0);
2423     __ leave();
2424     __ ret();
2425 
2426   __ bind(L_aes192);
2427     // Note: the following function performs key += 13*16
2428     generate_aes_loadkeys(key, working_vregs, 13);
2429     generate_aes_decrypt(res, working_vregs, 13);
2430     __ vse32_v(res, to);
2431     __ mv(c_rarg0, 0);
2432     __ leave();
2433     __ ret();
2434 
2435   __ bind(L_aes128);
2436     // Note: the following function performs key += 11*16
2437     generate_aes_loadkeys(key, working_vregs, 11);
2438     generate_aes_decrypt(res, working_vregs, 11);
2439     __ vse32_v(res, to);
2440     __ mv(c_rarg0, 0);
2441     __ leave();
2442     __ ret();
2443 
2444     return start;
2445   }
2446 
2447   // code for comparing 16 bytes of strings with same encoding
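       // On entry tmp1/tmp2 already hold 8 not-yet-compared bytes of str1/str2.
       // This compares that pair and the following 8 bytes, advances both string
       // pointers by 16, and leaves the next (still uncompared) 8-byte pair in
       // tmp1/tmp2 for the following iteration (software-pipelined).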
2448   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
2449     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31;
2450     __ ld(tmp5, Address(str1));
2451     __ addi(str1, str1, 8);
2452     __ xorr(tmp4, tmp1, tmp2);
2453     __ ld(cnt1, Address(str2));
2454     __ addi(str2, str2, 8);
2455     __ bnez(tmp4, DIFF1);
2456     __ ld(tmp1, Address(str1));
2457     __ addi(str1, str1, 8);
2458     __ xorr(tmp4, tmp5, cnt1);
2459     __ ld(tmp2, Address(str2));
2460     __ addi(str2, str2, 8);
2461     __ bnez(tmp4, DIFF2);
2462   }
2463 
2464   // code for comparing 8 characters of strings with Latin1 and Utf16 encoding
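       // Loads 8 Latin1 bytes from strL and 16 bytes (8 chars) from strU,
       // inflates the Latin1 bytes to UTF-16 and compares them 4 chars at a
       // time, branching to DIFF on the first mismatch.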
2465   void compare_string_8_x_LU(Register tmpL, Register tmpU, Register strL, Register strU, Label& DIFF) {
2466     const Register tmp = x30, tmpLval = x12;
2467     __ ld(tmpLval, Address(strL));
2468     __ addi(strL, strL, wordSize);
2469     __ ld(tmpU, Address(strU));
2470     __ addi(strU, strU, wordSize);
2471     __ inflate_lo32(tmpL, tmpLval);
2472     __ xorr(tmp, tmpU, tmpL);
2473     __ bnez(tmp, DIFF);
2474 
2475     __ ld(tmpU, Address(strU));
2476     __ addi(strU, strU, wordSize);
2477     __ inflate_hi32(tmpL, tmpLval);
2478     __ xorr(tmp, tmpU, tmpL);
2479     __ bnez(tmp, DIFF);
2480   }
2481 
2482   // x10  = result
2483   // x11  = str1
2484   // x12  = cnt1
2485   // x13  = str2
2486   // x14  = cnt2
2487   // x28  = tmp1
2488   // x29  = tmp2
2489   // x30  = tmp3
2490   address generate_compare_long_string_different_encoding(bool isLU) {
2491     __ align(CodeEntryAlignment);
2492     StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL");
2493     address entry = __ pc();
2494     Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
2495     const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
2496                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
2497 
2498     // cnt2 == amount of characters left to compare
2499     // Check already loaded first 4 symbols
2500     __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2);
2501     __ mv(isLU ? tmp1 : tmp2, tmp3);
2502     __ addi(str1, str1, isLU ? wordSize / 2 : wordSize);
2503     __ addi(str2, str2, isLU ? wordSize : wordSize / 2);
2504     __ sub(cnt2, cnt2, wordSize / 2); // Already loaded 4 symbols
2505 
2506     __ xorr(tmp3, tmp1, tmp2);
2507     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2508 
2509     Register strU = isLU ? str2 : str1,
2510              strL = isLU ? str1 : str2,
2511              tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
2512              tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
2513 
2514     // To make the main loop 8-byte aligned on strL, load another 4 bytes from strL.
2515     // cnt2 is >= 68 here, so there is no need to check it for >= 0.
2516     __ lwu(tmpL, Address(strL));
2517     __ addi(strL, strL, wordSize / 2);
2518     __ ld(tmpU, Address(strU));
2519     __ addi(strU, strU, wordSize);
2520     __ inflate_lo32(tmp3, tmpL);
2521     __ mv(tmpL, tmp3);
2522     __ xorr(tmp3, tmpU, tmpL);
2523     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2524     __ addi(cnt2, cnt2, -wordSize / 2);
2525 
2526     // we are now 8-bytes aligned on strL
2527     __ sub(cnt2, cnt2, wordSize * 2);
2528     __ bltz(cnt2, TAIL);
2529     __ bind(SMALL_LOOP); // smaller loop
2530       __ sub(cnt2, cnt2, wordSize * 2);
2531       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2532       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2533       __ bgez(cnt2, SMALL_LOOP);
2534       __ addi(t0, cnt2, wordSize * 2);
2535       __ beqz(t0, DONE);
2536     __ bind(TAIL);  // 1..15 characters left
2537       // Aligned access. Load bytes in portions - 4, 2, 1.
2538 
2539       __ addi(t0, cnt2, wordSize);
2540       __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
2541       __ bltz(t0, LOAD_LAST);
2542       // at least 8 characters remain, so we can do one compare_string_8_x_LU
2543       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2544       __ addi(cnt2, cnt2, -wordSize);
2545       __ beqz(cnt2, DONE);  // no character left
2546       __ bind(LOAD_LAST);   // cnt2 = 1..7 characters left
2547 
2548       __ addi(cnt2, cnt2, -wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
2549       __ slli(t0, cnt2, 1);     // t0 is now an offset in strU which points to last 16 bytes
2550       __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
2551       __ add(strU, strU, t0);   // Address of last 16 bytes in UTF-16 string
2552       __ load_int_misaligned(tmpL, Address(strL), t0, false);
2553       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2554       __ inflate_lo32(tmp3, tmpL);
2555       __ mv(tmpL, tmp3);
2556       __ xorr(tmp3, tmpU, tmpL);
2557       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2558 
2559       __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
2560       __ addi(strU, strU, wordSize);   // Address of last 8 bytes in UTF-16 string
2561       __ load_int_misaligned(tmpL, Address(strL), t0, false);
2562       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2563       __ inflate_lo32(tmp3, tmpL);
2564       __ mv(tmpL, tmp3);
2565       __ xorr(tmp3, tmpU, tmpL);
2566       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2567       __ j(DONE); // no character left
2568 
2569       // Find the first different characters in the longwords and
2570       // compute their difference.
2571     __ bind(CALCULATE_DIFFERENCE);
2572       __ ctzc_bit(tmp4, tmp3);
2573       __ srl(tmp1, tmp1, tmp4);
2574       __ srl(tmp2, tmp2, tmp4);
2575       __ andi(tmp1, tmp1, 0xFFFF);
2576       __ andi(tmp2, tmp2, 0xFFFF);
2577       __ sub(result, tmp1, tmp2);
2578     __ bind(DONE);
2579       __ ret();
2580     return entry;
2581   }
2582 
2583   address generate_method_entry_barrier() {
2584     __ align(CodeEntryAlignment);
2585     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
2586 
2587     Label deoptimize_label;
2588 
2589     address start = __ pc();
2590 
2591     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2592 
2593     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
2594       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
2595       Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
2596       __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
2597       __ lwu(t1, t1);
2598       __ sw(t1, thread_epoch_addr);
2599       // There are two ways this can work:
2600       // - The writer did a system-wide icache shootdown after the instruction
2601       //   stream update, hence we do nothing here.
2602       // - The writer trusts us to make sure our icache is in sync before
2603       //   entering, hence we use a cmodx fence (fence.i, may change).
2604       if (UseCtxFencei) {
2605         __ cmodx_fence();
2606       }
2607       __ membar(__ LoadLoad);
2608     }
2609 
2610     __ set_last_Java_frame(sp, fp, ra);
2611 
2612     __ enter();
2613     __ add(t1, sp, wordSize);
2614 
2615     __ sub(sp, sp, 4 * wordSize);
2616 
2617     __ push_call_clobbered_registers();
2618 
2619     __ mv(c_rarg0, t1);
2620     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
2621 
2622     __ reset_last_Java_frame(true);
2623 
2624     __ mv(t0, x10);
2625 
2626     __ pop_call_clobbered_registers();
2627 
2628     __ bnez(t0, deoptimize_label);
2629 
2630     __ leave();
2631     __ ret();
2632 
2633     __ BIND(deoptimize_label);
2634 
2635     __ ld(t0, Address(sp, 0));
2636     __ ld(fp, Address(sp, wordSize));
2637     __ ld(ra, Address(sp, wordSize * 2));
2638     __ ld(t1, Address(sp, wordSize * 3));
2639 
2640     __ mv(sp, t0);
2641     __ jr(t1);
2642 
2643     return start;
2644   }
2645 
2646   // x10  = result
2647   // x11  = str1
2648   // x12  = cnt1
2649   // x13  = str2
2650   // x14  = cnt2
2651   // x28  = tmp1
2652   // x29  = tmp2
2653   // x30  = tmp3
2654   // x31  = tmp4
2655   address generate_compare_long_string_same_encoding(bool isLL) {
2656     __ align(CodeEntryAlignment);
2657     StubCodeMark mark(this, "StubRoutines", isLL ?
2658                       "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU");
2659     address entry = __ pc();
2660     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
2661           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
2662     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2663                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2664     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
2665 
2666     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
2667     // Update the cnt2 counter for the 8 bytes already loaded.
2668     __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
2669     // update pointers, because of previous read
2670     __ add(str1, str1, wordSize);
2671     __ add(str2, str2, wordSize);
2672     // less than 16 bytes left?
2673     __ sub(cnt2, cnt2, isLL ? 16 : 8);
2674     __ push_reg(spilled_regs, sp);
2675     __ bltz(cnt2, TAIL);
2676     __ bind(SMALL_LOOP);
2677       compare_string_16_bytes_same(DIFF, DIFF2);
2678       __ sub(cnt2, cnt2, isLL ? 16 : 8);
2679       __ bgez(cnt2, SMALL_LOOP);
2680     __ bind(TAIL);
2681       __ addi(cnt2, cnt2, isLL ? 16 : 8);
2682       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
2683       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2684       __ blez(cnt2, CHECK_LAST);
2685       __ xorr(tmp4, tmp1, tmp2);
2686       __ bnez(tmp4, DIFF);
2687       __ ld(tmp1, Address(str1));
2688       __ addi(str1, str1, 8);
2689       __ ld(tmp2, Address(str2));
2690       __ addi(str2, str2, 8);
2691       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2692     __ bind(CHECK_LAST);
2693       if (!isLL) {
2694         __ add(cnt2, cnt2, cnt2); // now in bytes
2695       }
2696       __ xorr(tmp4, tmp1, tmp2);
2697       __ bnez(tmp4, DIFF);
2698       __ add(str1, str1, cnt2);
2699       __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
2700       __ add(str2, str2, cnt2);
2701       __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
2702       __ xorr(tmp4, tmp5, cnt1);
2703       __ beqz(tmp4, LENGTH_DIFF);
2704       // Find the first different characters in the longwords and
2705       // compute their difference.
2706     __ bind(DIFF2);
2707       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2708       __ srl(tmp5, tmp5, tmp3);
2709       __ srl(cnt1, cnt1, tmp3);
2710       if (isLL) {
2711         __ andi(tmp5, tmp5, 0xFF);
2712         __ andi(cnt1, cnt1, 0xFF);
2713       } else {
2714         __ andi(tmp5, tmp5, 0xFFFF);
2715         __ andi(cnt1, cnt1, 0xFFFF);
2716       }
2717       __ sub(result, tmp5, cnt1);
2718       __ j(LENGTH_DIFF);
2719     __ bind(DIFF);
2720       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2721       __ srl(tmp1, tmp1, tmp3);
2722       __ srl(tmp2, tmp2, tmp3);
2723       if (isLL) {
2724         __ andi(tmp1, tmp1, 0xFF);
2725         __ andi(tmp2, tmp2, 0xFF);
2726       } else {
2727         __ andi(tmp1, tmp1, 0xFFFF);
2728         __ andi(tmp2, tmp2, 0xFFFF);
2729       }
2730       __ sub(result, tmp1, tmp2);
2731       __ j(LENGTH_DIFF);
2732     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
2733       __ xorr(tmp4, tmp1, tmp2);
2734       __ bnez(tmp4, DIFF);
2735     __ bind(LENGTH_DIFF);
2736       __ pop_reg(spilled_regs, sp);
2737       __ ret();
2738     return entry;
2739   }
2740 
2741   void generate_compare_long_strings() {
2742     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true);
2743     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false);
2744     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true);
2745     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false);
2746   }
2747 
2748   // x10 result
2749   // x11 src
2750   // x12 src count
2751   // x13 pattern
2752   // x14 pattern count
2753   address generate_string_indexof_linear(bool needle_isL, bool haystack_isL)
2754   {
2755     const char* stubName = needle_isL
2756            ? (haystack_isL ? "indexof_linear_ll" : "indexof_linear_ul")
2757            : "indexof_linear_uu";
2758     __ align(CodeEntryAlignment);
2759     StubCodeMark mark(this, "StubRoutines", stubName);
2760     address entry = __ pc();
2761 
2762     int needle_chr_size = needle_isL ? 1 : 2;
2763     int haystack_chr_size = haystack_isL ? 1 : 2;
2764     int needle_chr_shift = needle_isL ? 0 : 1;
2765     int haystack_chr_shift = haystack_isL ? 0 : 1;
2766     bool isL = needle_isL && haystack_isL;
2767     // parameters
2768     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
2769     // temporary registers
2770     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
2771     // redefinitions
2772     Register ch1 = x28, ch2 = x29;
2773     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
2774 
2775     __ push_reg(spilled_regs, sp);
2776 
2777     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
2778           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
2779           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
2780           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
2781           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
2782           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
2783 
2784     __ ld(ch1, Address(needle));
2785     __ ld(ch2, Address(haystack));
2786     // src.length - pattern.length
2787     __ sub(haystack_len, haystack_len, needle_len);
2788 
2789     // first is needle[0]
2790     __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first);
2791     uint64_t mask0101 = UCONST64(0x0101010101010101);
2792     uint64_t mask0001 = UCONST64(0x0001000100010001);
2793     __ mv(mask1, haystack_isL ? mask0101 : mask0001);
2794     __ mul(first, first, mask1);
2795     uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
2796     uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
2797     __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
2798     if (needle_isL != haystack_isL) {
2799       __ mv(tmp, ch1);
2800     }
2801     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
2802     __ blez(haystack_len, L_SMALL);
2803 
2804     if (needle_isL != haystack_isL) {
2805       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2806     }
2807     // xorr, sub, orr, notr, andr
2808     // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
2809     // eg:
2810     // first:        aa aa aa aa aa aa aa aa
2811     // ch2:          aa aa li nx jd ka aa aa
2812     // match_mask:   80 80 00 00 00 00 80 80
2813     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
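         // A sketch of what compute_match_mask does, inferred from the inlined copy used on
         // the L_POST_LOOP/L_SMALL paths below: with x = ch2 ^ first it computes
         // (x - mask1) & ~(x | mask2), the classic zero-lane trick, so every lane of ch2
         // that equals the corresponding lane of 'first' gets its top bit (0x80/0x8000) set.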
2814 
2815     // search first char of needle, if success, goto L_HAS_ZERO;
2816     __ bnez(match_mask, L_HAS_ZERO);
2817     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2818     __ add(result, result, wordSize / haystack_chr_size);
2819     __ add(haystack, haystack, wordSize);
2820     __ bltz(haystack_len, L_POST_LOOP);
2821 
2822     __ bind(L_LOOP);
2823     __ ld(ch2, Address(haystack));
2824     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2825     __ bnez(match_mask, L_HAS_ZERO);
2826 
2827     __ bind(L_LOOP_PROCEED);
2828     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2829     __ add(haystack, haystack, wordSize);
2830     __ add(result, result, wordSize / haystack_chr_size);
2831     __ bgez(haystack_len, L_LOOP);
2832 
2833     __ bind(L_POST_LOOP);
2834     __ mv(ch2, -wordSize / haystack_chr_size);
2835     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
2836     __ ld(ch2, Address(haystack));
2837     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2838     __ neg(haystack_len, haystack_len);
2839     __ xorr(ch2, first, ch2);
2840     __ sub(match_mask, ch2, mask1);
2841     __ orr(ch2, ch2, mask2);
2842     __ mv(trailing_zeros, -1); // all bits set
2843     __ j(L_SMALL_PROCEED);
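         // Note: the tail of the zero-lane computation (the notr/andr plus the useless-bit
         // mask) is shared with the L_SMALL path below via L_SMALL_PROCEED.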
2844 
2845     __ align(OptoLoopAlignment);
2846     __ bind(L_SMALL);
2847     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2848     __ neg(haystack_len, haystack_len);
2849     if (needle_isL != haystack_isL) {
2850       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2851     }
2852     __ xorr(ch2, first, ch2);
2853     __ sub(match_mask, ch2, mask1);
2854     __ orr(ch2, ch2, mask2);
2855     __ mv(trailing_zeros, -1); // all bits set
2856 
2857     __ bind(L_SMALL_PROCEED);
2858     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
2859     __ notr(ch2, ch2);
2860     __ andr(match_mask, match_mask, ch2);
2861     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
2862     __ beqz(match_mask, NOMATCH);
2863 
2864     __ bind(L_SMALL_HAS_ZERO_LOOP);
2865     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros
2866     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2867     __ mv(ch2, wordSize / haystack_chr_size);
2868     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
2869     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2870     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2871     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2872 
2873     __ bind(L_SMALL_CMP_LOOP);
2874     __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
2875     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2876     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
2877     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2878     __ add(trailing_zeros, trailing_zeros, 1);
2879     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
2880     __ beq(first, ch2, L_SMALL_CMP_LOOP);
2881 
2882     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
2883     __ beqz(match_mask, NOMATCH);
2884     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2885     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2886     __ add(result, result, 1);
2887     __ add(haystack, haystack, haystack_chr_size);
2888     __ j(L_SMALL_HAS_ZERO_LOOP);
2889 
2890     __ align(OptoLoopAlignment);
2891     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
2892     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2893     __ j(DONE);
2894 
2895     __ align(OptoLoopAlignment);
2896     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
2897     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2898     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2899     __ j(DONE);
2900 
2901     __ align(OptoLoopAlignment);
2902     __ bind(L_HAS_ZERO);
2903     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2904     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2905     __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
2906     __ orr(haystack_len, haystack_len, needle_len); // pack needle_len into the upper 32 bits of haystack_len
2907     __ sub(result, result, 1); // array index from 0, so result -= 1
2908 
2909     __ bind(L_HAS_ZERO_LOOP);
2910     __ mv(needle_len, wordSize / haystack_chr_size);
2911     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
2912     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
2913     // load next 8 bytes from haystack, and increase result index
2914     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2915     __ add(result, result, 1);
2916     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2917     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2918 
2919     // compare one char
2920     __ bind(L_CMP_LOOP);
2921     __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
2922     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
2923     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2924     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2925     __ add(trailing_zeros, trailing_zeros, 1); // next char index
2926     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
2927     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
2928     __ beq(needle_len, ch2, L_CMP_LOOP);
2929 
2930     __ bind(L_CMP_LOOP_NOMATCH);
2931     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
2932     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index
2933     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2934     __ add(haystack, haystack, haystack_chr_size);
2935     __ j(L_HAS_ZERO_LOOP);
2936 
2937     __ align(OptoLoopAlignment);
2938     __ bind(L_CMP_LOOP_LAST_CMP);
2939     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
2940     __ j(DONE);
2941 
2942     __ align(OptoLoopAlignment);
2943     __ bind(L_CMP_LOOP_LAST_CMP2);
2944     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2945     __ add(result, result, 1);
2946     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2947     __ j(DONE);
2948 
2949     __ align(OptoLoopAlignment);
2950     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
2951     // 1) Restore the "result" index. Until the L_HAS_ZERO block it was a multiple of
2952     // wordSize/haystack_chr_size. The byte octet was analyzed in L_HAS_ZERO_LOOP, so
2953     // result was increased by at most wordSize/haystack_chr_size - 1 and the
2954     // respective high bits were not changed. L_LOOP_PROCEED will increase result by
2955     // the number of analyzed characters, so we can simply reset its lower bits here:
2956     // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
2957     // 2) Restore the needle_len and haystack_len values from the "compressed" haystack_len.
2958     // 3) Advance haystack to the next octet. result & 7 (LL) or result & 3 (UU/UL) is the
2959     // index of the last analyzed substring inside the current octet, so haystack is first
2960     // moved back to the octet start address and then advanced by L_LOOP_PROCEED.
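         // For example (LL path, hypothetical values): if result == 13 here, then
         // result & 7 == 5 characters of the current octet were already analyzed;
         // result &= -8 rewinds the index to the octet boundary and haystack is moved back
         // by those 5 characters, so that L_LOOP_PROCEED re-advances both by a full octet.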
2961     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
2962     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
2963     __ andi(result, result, haystack_isL ? -8 : -4);
2964     __ slli(tmp, match_mask, haystack_chr_shift);
2965     __ sub(haystack, haystack, tmp);
2966     __ sign_extend(haystack_len, haystack_len, 32);
2967     __ j(L_LOOP_PROCEED);
2968 
2969     __ align(OptoLoopAlignment);
2970     __ bind(NOMATCH);
2971     __ mv(result, -1);
2972 
2973     __ bind(DONE);
2974     __ pop_reg(spilled_regs, sp);
2975     __ ret();
2976     return entry;
2977   }
2978 
2979   void generate_string_indexof_stubs()
2980   {
2981     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
2982     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
2983     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
2984   }
2985 
2986 #ifdef COMPILER2
2987   address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
2988     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
2989 
2990     address start = __ pc();
2991     const Register
2992       r_super_klass  = x10,
2993       r_array_base   = x11,
2994       r_array_length = x12,
2995       r_array_index  = x13,
2996       r_sub_klass    = x14,
2997       result         = x15,
2998       r_bitmap       = x16;
2999 
3000     Label L_success;
3001     __ enter();
3002     __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, result,
3003                                      r_array_base, r_array_length, r_array_index,
3004                                      r_bitmap, super_klass_index, /*stub_is_near*/true);
3005     __ leave();
3006     __ ret();
3007 
3008     return start;
3009   }
3010 
3011   // Slow path implementation for UseSecondarySupersTable.
3012   address generate_lookup_secondary_supers_table_slow_path_stub() {
3013     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
3014 
3015     address start = __ pc();
3016     const Register
3017       r_super_klass  = x10,        // argument
3018       r_array_base   = x11,        // argument
3019       temp1          = x12,        // tmp
3020       r_array_index  = x13,        // argument
3021       result         = x15,        // argument
3022       r_bitmap       = x16;        // argument
3023 
3024 
3025     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
3026     __ ret();
3027 
3028     return start;
3029   }
3030 
3031   address generate_mulAdd()
3032   {
3033     __ align(CodeEntryAlignment);
3034     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3035 
3036     address entry = __ pc();
3037 
3038     const Register out     = x10;
3039     const Register in      = x11;
3040     const Register offset  = x12;
3041     const Register len     = x13;
3042     const Register k       = x14;
3043     const Register tmp     = x28;
3044 
3045     BLOCK_COMMENT("Entry:");
3046     __ enter();
3047     __ mul_add(out, in, offset, len, k, tmp);
3048     __ leave();
3049     __ ret();
3050 
3051     return entry;
3052   }
3053 
3054   /**
3055    *  Arguments:
3056    *
3057    *  Input:
3058    *    c_rarg0   - x address
3059    *    c_rarg1   - x length
3060    *    c_rarg2   - y address
3061    *    c_rarg3   - y length
3062    *    c_rarg4   - z address
3063    */
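       // (The destination array z is assumed to provide room for xlen + ylen ints;
       //  the Java-side caller of the intrinsic is expected to guarantee that.)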
3064   address generate_multiplyToLen()
3065   {
3066     __ align(CodeEntryAlignment);
3067     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3068     address entry = __ pc();
3069 
3070     const Register x     = x10;
3071     const Register xlen  = x11;
3072     const Register y     = x12;
3073     const Register ylen  = x13;
3074     const Register z     = x14;
3075 
3076     const Register tmp0  = x15;
3077     const Register tmp1  = x16;
3078     const Register tmp2  = x17;
3079     const Register tmp3  = x7;
3080     const Register tmp4  = x28;
3081     const Register tmp5  = x29;
3082     const Register tmp6  = x30;
3083     const Register tmp7  = x31;
3084 
3085     BLOCK_COMMENT("Entry:");
3086     __ enter(); // required for proper stackwalking of RuntimeStub frame
3087     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3088     __ leave(); // required for proper stackwalking of RuntimeStub frame
3089     __ ret();
3090 
3091     return entry;
3092   }
3093 
3094   address generate_squareToLen()
3095   {
3096     __ align(CodeEntryAlignment);
3097     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3098     address entry = __ pc();
3099 
3100     const Register x     = x10;
3101     const Register xlen  = x11;
3102     const Register z     = x12;
3103     const Register y     = x14; // == x
3104     const Register ylen  = x15; // == xlen
3105 
3106     const Register tmp0  = x13; // zlen, unused
3107     const Register tmp1  = x16;
3108     const Register tmp2  = x17;
3109     const Register tmp3  = x7;
3110     const Register tmp4  = x28;
3111     const Register tmp5  = x29;
3112     const Register tmp6  = x30;
3113     const Register tmp7  = x31;
3114 
3115     BLOCK_COMMENT("Entry:");
3116     __ enter();
3117     __ mv(y, x);
3118     __ mv(ylen, xlen);
3119     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3120     __ leave();
3121     __ ret();
3122 
3123     return entry;
3124   }
3125 
3126   // Arguments:
3127   //
3128   // Input:
3129   //   c_rarg0   - newArr address
3130   //   c_rarg1   - oldArr address
3131   //   c_rarg2   - newIdx
3132   //   c_rarg3   - shiftCount
3133   //   c_rarg4   - numIter
3134   //
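       // A rough sketch of what the vector loop below computes per 32-bit element:
       //   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
       // processing up to vl elements per iteration with LMUL=4 vector groups.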
3135   address generate_bigIntegerLeftShift() {
3136     __ align(CodeEntryAlignment);
3137     StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
3138     address entry = __ pc();
3139 
3140     Label loop, exit;
3141 
3142     Register newArr        = c_rarg0;
3143     Register oldArr        = c_rarg1;
3144     Register newIdx        = c_rarg2;
3145     Register shiftCount    = c_rarg3;
3146     Register numIter       = c_rarg4;
3147 
3148     Register shiftRevCount = c_rarg5;
3149     Register oldArrNext    = t1;
3150 
3151     __ beqz(numIter, exit);
3152     __ shadd(newArr, newIdx, newArr, t0, 2);
3153 
3154     __ mv(shiftRevCount, 32);
3155     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3156 
3157     __ bind(loop);
3158     __ addi(oldArrNext, oldArr, 4);
3159     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
3160     __ vle32_v(v0, oldArr);
3161     __ vle32_v(v4, oldArrNext);
3162     __ vsll_vx(v0, v0, shiftCount);
3163     __ vsrl_vx(v4, v4, shiftRevCount);
3164     __ vor_vv(v0, v0, v4);
3165     __ vse32_v(v0, newArr);
3166     __ sub(numIter, numIter, t0);
3167     __ shadd(oldArr, t0, oldArr, t1, 2);
3168     __ shadd(newArr, t0, newArr, t1, 2);
3169     __ bnez(numIter, loop);
3170 
3171     __ bind(exit);
3172     __ ret();
3173 
3174     return entry;
3175   }
3176 
3177   // Arguments:
3178   //
3179   // Input:
3180   //   c_rarg0   - newArr address
3181   //   c_rarg1   - oldArr address
3182   //   c_rarg2   - newIdx
3183   //   c_rarg3   - shiftCount
3184   //   c_rarg4   - numIter
3185   //
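       // A rough sketch of what the vector loop below computes per 32-bit element
       // (walking the arrays from the high index downwards, with newArr rebased by newIdx):
       //   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))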
3186   address generate_bigIntegerRightShift() {
3187     __ align(CodeEntryAlignment);
3188     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
3189     address entry = __ pc();
3190 
3191     Label loop, exit;
3192 
3193     Register newArr        = c_rarg0;
3194     Register oldArr        = c_rarg1;
3195     Register newIdx        = c_rarg2;
3196     Register shiftCount    = c_rarg3;
3197     Register numIter       = c_rarg4;
3198     Register idx           = numIter;
3199 
3200     Register shiftRevCount = c_rarg5;
3201     Register oldArrNext    = c_rarg6;
3202     Register newArrCur     = t0;
3203     Register oldArrCur     = t1;
3204 
3205     __ beqz(idx, exit);
3206     __ shadd(newArr, newIdx, newArr, t0, 2);
3207 
3208     __ mv(shiftRevCount, 32);
3209     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3210 
3211     __ bind(loop);
3212     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3213     __ sub(idx, idx, t0);
3214     __ shadd(oldArrNext, idx, oldArr, t1, 2);
3215     __ shadd(newArrCur, idx, newArr, t1, 2);
3216     __ addi(oldArrCur, oldArrNext, 4);
3217     __ vle32_v(v0, oldArrCur);
3218     __ vle32_v(v4, oldArrNext);
3219     __ vsrl_vx(v0, v0, shiftCount);
3220     __ vsll_vx(v4, v4, shiftRevCount);
3221     __ vor_vv(v0, v0, v4);
3222     __ vse32_v(v0, newArrCur);
3223     __ bnez(idx, loop);
3224 
3225     __ bind(exit);
3226     __ ret();
3227 
3228     return entry;
3229   }
3230 #endif
3231 
3232 #ifdef COMPILER2
3233   class MontgomeryMultiplyGenerator : public MacroAssembler {
3234 
3235     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3236       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3237 
3238     RegSet _toSave;
3239     bool _squaring;
3240 
3241   public:
3242     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3243       : MacroAssembler(as->code()), _squaring(squaring) {
3244 
3245       // Register allocation
3246 
3247       RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3248       Pa_base = *regs;       // Argument registers
3249       if (squaring) {
3250         Pb_base = Pa_base;
3251       } else {
3252         Pb_base = *++regs;
3253       }
3254       Pn_base = *++regs;
3255       Rlen= *++regs;
3256       inv = *++regs;
3257       Pm_base = *++regs;
3258 
3259                         // Working registers:
3260       Ra =  *++regs;    // The current digit of a, b, n, and m.
3261       Rb =  *++regs;
3262       Rm =  *++regs;
3263       Rn =  *++regs;
3264 
3265       Pa =  *++regs;      // Pointers to the current/next digit of a, b, n, and m.
3266       Pb =  *++regs;
3267       Pm =  *++regs;
3268       Pn =  *++regs;
3269 
3270       tmp0 =  *++regs;    // Three registers which form a
3271       tmp1 =  *++regs;    // triple-precision accumulator.
3272       tmp2 =  *++regs;
3273 
3274       Ri =  x6;         // Inner and outer loop indexes.
3275       Rj =  x7;
3276 
3277       Rhi_ab = x28;     // Product registers: low and high parts
3278       Rlo_ab = x29;     // of a*b and m*n.
3279       Rhi_mn = x30;
3280       Rlo_mn = x31;
3281 
3282       // x18 and up are callee-saved.
3283       _toSave = RegSet::range(x18, *regs) + Pm_base;
3284     }
3285 
3286   private:
3287     void save_regs() {
3288       push_reg(_toSave, sp);
3289     }
3290 
3291     void restore_regs() {
3292       pop_reg(_toSave, sp);
3293     }
3294 
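         // unroll_2 runs 'block' count times (nothing for count == 0): two calls per loop
         // iteration, with an odd count entering at 'odd' so the first pass makes a single call.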
3295     template <typename T>
3296     void unroll_2(Register count, T block) {
3297       Label loop, end, odd;
3298       beqz(count, end);
3299       test_bit(t0, count, 0);
3300       bnez(t0, odd);
3301       align(16);
3302       bind(loop);
3303       (this->*block)();
3304       bind(odd);
3305       (this->*block)();
3306       addi(count, count, -2);
3307       bgtz(count, loop);
3308       bind(end);
3309     }
3310 
3311     template <typename T>
3312     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3313       Label loop, end, odd;
3314       beqz(count, end);
3315       test_bit(tmp, count, 0);
3316       bnez(tmp, odd);
3317       align(16);
3318       bind(loop);
3319       (this->*block)(d, s, tmp);
3320       bind(odd);
3321       (this->*block)(d, s, tmp);
3322       addi(count, count, -2);
3323       bgtz(count, loop);
3324       bind(end);
3325     }
3326 
3327     void pre1(RegisterOrConstant i) {
3328       block_comment("pre1");
3329       // Pa = Pa_base;
3330       // Pb = Pb_base + i;
3331       // Pm = Pm_base;
3332       // Pn = Pn_base + i;
3333       // Ra = *Pa;
3334       // Rb = *Pb;
3335       // Rm = *Pm;
3336       // Rn = *Pn;
3337       if (i.is_register()) {
3338         slli(t0, i.as_register(), LogBytesPerWord);
3339       } else {
3340         mv(t0, i.as_constant());
3341         slli(t0, t0, LogBytesPerWord);
3342       }
3343 
3344       mv(Pa, Pa_base);
3345       add(Pb, Pb_base, t0);
3346       mv(Pm, Pm_base);
3347       add(Pn, Pn_base, t0);
3348 
3349       ld(Ra, Address(Pa));
3350       ld(Rb, Address(Pb));
3351       ld(Rm, Address(Pm));
3352       ld(Rn, Address(Pn));
3353 
3354       // Zero the m*n result.
3355       mv(Rhi_mn, zr);
3356       mv(Rlo_mn, zr);
3357     }
3358 
3359     // The core multiply-accumulate step of a Montgomery
3360     // multiplication.  The idea is to schedule operations as a
3361     // pipeline so that instructions with long latencies (loads and
3362     // multiplies) have time to complete before their results are
3363     // used.  This benefits in-order implementations of the architecture
3364     // the most, but out-of-order ones also benefit.
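         // In aggregate, each step() accumulates one a*b and one m*n product into the
         // triple-precision accumulator (tmp2:tmp1:tmp0), walking Pa/Pm forwards and
         // Pb/Pn backwards (the m*n add is deferred by one step to hide its latency).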
3365     void step() {
3366       block_comment("step");
3367       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3368       // Ra = *++Pa;
3369       // Rb = *--Pb;
3370       mulhu(Rhi_ab, Ra, Rb);
3371       mul(Rlo_ab, Ra, Rb);
3372       addi(Pa, Pa, wordSize);
3373       ld(Ra, Address(Pa));
3374       addi(Pb, Pb, -wordSize);
3375       ld(Rb, Address(Pb));
3376       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
3377                                             // previous iteration.
3378       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3379       // Rm = *++Pm;
3380       // Rn = *--Pn;
3381       mulhu(Rhi_mn, Rm, Rn);
3382       mul(Rlo_mn, Rm, Rn);
3383       addi(Pm, Pm, wordSize);
3384       ld(Rm, Address(Pm));
3385       addi(Pn, Pn, -wordSize);
3386       ld(Rn, Address(Pn));
3387       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3388     }
3389 
3390     void post1() {
3391       block_comment("post1");
3392 
3393       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3394       // Ra = *++Pa;
3395       // Rb = *--Pb;
3396       mulhu(Rhi_ab, Ra, Rb);
3397       mul(Rlo_ab, Ra, Rb);
3398       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3399       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3400 
3401       // *Pm = Rm = tmp0 * inv;
3402       mul(Rm, tmp0, inv);
3403       sd(Rm, Address(Pm));
3404 
3405       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3406       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3407       mulhu(Rhi_mn, Rm, Rn);
3408 
3409 #ifndef PRODUCT
3410       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3411       {
3412         mul(Rlo_mn, Rm, Rn);
3413         add(Rlo_mn, tmp0, Rlo_mn);
3414         Label ok;
3415         beqz(Rlo_mn, ok);
3416         stop("broken Montgomery multiply");
3417         bind(ok);
3418       }
3419 #endif
3420       // We have very carefully set things up so that
3421       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3422       // the lower half of Rm * Rn because we know the result already:
3423       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3424       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3425       // the carry flag iff tmp0 is nonzero.
3426       //
3427       // mul(Rlo_mn, Rm, Rn);
3428       // cad(zr, tmp0, Rlo_mn);
3429       addi(t0, tmp0, -1);
3430       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3431       cadc(tmp0, tmp1, Rhi_mn, t0);
3432       adc(tmp1, tmp2, zr, t0);
3433       mv(tmp2, zr);
3434     }
3435 
3436     void pre2(Register i, Register len) {
3437       block_comment("pre2");
3438       // Pa = Pa_base + i-len;
3439       // Pb = Pb_base + len;
3440       // Pm = Pm_base + i-len;
3441       // Pn = Pn_base + len;
3442 
3443       sub(Rj, i, len);
3444       // Rj == i-len
3445 
3446       // Ra as temp register
3447       slli(Ra, Rj, LogBytesPerWord);
3448       add(Pa, Pa_base, Ra);
3449       add(Pm, Pm_base, Ra);
3450       slli(Ra, len, LogBytesPerWord);
3451       add(Pb, Pb_base, Ra);
3452       add(Pn, Pn_base, Ra);
3453 
3454       // Ra = *++Pa;
3455       // Rb = *--Pb;
3456       // Rm = *++Pm;
3457       // Rn = *--Pn;
3458       add(Pa, Pa, wordSize);
3459       ld(Ra, Address(Pa));
3460       add(Pb, Pb, -wordSize);
3461       ld(Rb, Address(Pb));
3462       add(Pm, Pm, wordSize);
3463       ld(Rm, Address(Pm));
3464       add(Pn, Pn, -wordSize);
3465       ld(Rn, Address(Pn));
3466 
3467       mv(Rhi_mn, zr);
3468       mv(Rlo_mn, zr);
3469     }
3470 
3471     void post2(Register i, Register len) {
3472       block_comment("post2");
3473       sub(Rj, i, len);
3474 
3475       cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
3476 
3477       // As soon as we know the least significant digit of our result,
3478       // store it.
3479       // Pm_base[i-len] = tmp0;
3480       // Rj as temp register
3481       slli(Rj, Rj, LogBytesPerWord);
3482       add(Rj, Pm_base, Rj);
3483       sd(tmp0, Address(Rj));
3484 
3485       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3486       cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
3487       adc(tmp1, tmp2, zr, t0);
3488       mv(tmp2, zr);
3489     }
3490 
3491     // A carry in tmp0 after Montgomery multiplication means that we
3492     // should subtract multiples of n from our result in m.  We'll
3493     // keep doing that until there is no carry.
3494     void normalize(Register len) {
3495       block_comment("normalize");
3496       // while (tmp0)
3497       //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
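           // Each pass of the outer loop subtracts n from m once (a multi-word subtract,
           // with the borrow tracked in t0 as a "no borrow" flag) and decrements the
           // carry word tmp0 by the resulting borrow.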
3498       Label loop, post, again;
3499       Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
3500       beqz(tmp0, post); {
3501         bind(again); {
3502           mv(i, zr);
3503           mv(cnt, len);
3504           slli(Rn, i, LogBytesPerWord);
3505           add(Rm, Pm_base, Rn);
3506           ld(Rm, Address(Rm));
3507           add(Rn, Pn_base, Rn);
3508           ld(Rn, Address(Rn));
3509           mv(t0, 1); // set carry flag, i.e. no borrow
3510           align(16);
3511           bind(loop); {
3512             notr(Rn, Rn);
3513             add(Rm, Rm, t0);
3514             add(Rm, Rm, Rn);
3515             sltu(t0, Rm, Rn);
3516             slli(Rn, i, LogBytesPerWord); // Rn as temp register
3517             add(Rn, Pm_base, Rn);
3518             sd(Rm, Address(Rn));
3519             add(i, i, 1);
3520             slli(Rn, i, LogBytesPerWord);
3521             add(Rm, Pm_base, Rn);
3522             ld(Rm, Address(Rm));
3523             add(Rn, Pn_base, Rn);
3524             ld(Rn, Address(Rn));
3525             sub(cnt, cnt, 1);
3526           } bnez(cnt, loop);
3527           addi(tmp0, tmp0, -1);
3528           add(tmp0, tmp0, t0);
3529         } bnez(tmp0, again);
3530       } bind(post);
3531     }
3532 
3533     // Move memory at s to d, reversing words.
3534     //    Increments d to end of copied memory
3535     //    Destroys tmp1, tmp2
3536     //    Preserves len
3537     //    Leaves s pointing to the address which was in d at start
3538     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3539       assert(tmp1->encoding() < x28->encoding(), "register corruption");
3540       assert(tmp2->encoding() < x28->encoding(), "register corruption");
3541 
3542       shadd(s, len, s, tmp1, LogBytesPerWord);
3543       mv(tmp1, len);
3544       unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3545       slli(tmp1, len, LogBytesPerWord);
3546       sub(s, d, tmp1);
3547     }
3548     // [63...0] -> [31...0][63...32]
3549     void reverse1(Register d, Register s, Register tmp) {
3550       addi(s, s, -wordSize);
3551       ld(tmp, Address(s));
3552       ror_imm(tmp, tmp, 32, t0);
3553       sd(tmp, Address(d));
3554       addi(d, d, wordSize);
3555     }
3556 
3557     void step_squaring() {
3558       // An extra ACC
3559       step();
3560       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3561     }
3562 
3563     void last_squaring(Register i) {
3564       Label dont;
3565       // if ((i & 1) == 0) {
3566       test_bit(t0, i, 0);
3567       bnez(t0, dont); {
3568         // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3569         // Ra = *++Pa;
3570         // Rb = *--Pb;
3571         mulhu(Rhi_ab, Ra, Rb);
3572         mul(Rlo_ab, Ra, Rb);
3573         acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3574       } bind(dont);
3575     }
3576 
3577     void extra_step_squaring() {
3578       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3579 
3580       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3581       // Rm = *++Pm;
3582       // Rn = *--Pn;
3583       mulhu(Rhi_mn, Rm, Rn);
3584       mul(Rlo_mn, Rm, Rn);
3585       addi(Pm, Pm, wordSize);
3586       ld(Rm, Address(Pm));
3587       addi(Pn, Pn, -wordSize);
3588       ld(Rn, Address(Pn));
3589     }
3590 
3591     void post1_squaring() {
3592       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3593 
3594       // *Pm = Rm = tmp0 * inv;
3595       mul(Rm, tmp0, inv);
3596       sd(Rm, Address(Pm));
3597 
3598       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3599       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3600       mulhu(Rhi_mn, Rm, Rn);
3601 
3602 #ifndef PRODUCT
3603       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3604       {
3605         mul(Rlo_mn, Rm, Rn);
3606         add(Rlo_mn, tmp0, Rlo_mn);
3607         Label ok;
3608         beqz(Rlo_mn, ok); {
3609           stop("broken Montgomery multiply");
3610         } bind(ok);
3611       }
3612 #endif
3613       // We have very carefully set things up so that
3614       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3615       // the lower half of Rm * Rn because we know the result already:
3616       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3617       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3618       // the carry flag iff tmp0 is nonzero.
3619       //
3620       // mul(Rlo_mn, Rm, Rn);
3621       // cad(zr, tmp0, Rlo_mn);
3622       addi(t0, tmp0, -1);
3623       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3624       cadc(tmp0, tmp1, Rhi_mn, t0);
3625       adc(tmp1, tmp2, zr, t0);
3626       mv(tmp2, zr);
3627     }
3628 
3629     // use t0 as carry
3630     void acc(Register Rhi, Register Rlo,
3631              Register tmp0, Register tmp1, Register tmp2) {
3632       cad(tmp0, tmp0, Rlo, t0);
3633       cadc(tmp1, tmp1, Rhi, t0);
3634       adc(tmp2, tmp2, zr, t0);
3635     }
3636 
3637   public:
3638     /**
3639      * Fast Montgomery multiplication.  The derivation of the
3640      * algorithm is in A Cryptographic Library for the Motorola
3641      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3642      *
3643      * Arguments:
3644      *
3645      * Inputs for multiplication:
3646      *   c_rarg0   - int array elements a
3647      *   c_rarg1   - int array elements b
3648      *   c_rarg2   - int array elements n (the modulus)
3649      *   c_rarg3   - int length
3650      *   c_rarg4   - int inv
3651      *   c_rarg5   - int array elements m (the result)
3652      *
3653      * Inputs for squaring:
3654      *   c_rarg0   - int array elements a
3655      *   c_rarg1   - int array elements n (the modulus)
3656      *   c_rarg2   - int length
3657      *   c_rarg3   - int inv
3658      *   c_rarg4   - int array elements m (the result)
3659      *
3660      */
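         // Roughly, this computes m = a * b * R^-1 mod n with R = 2^(32*len)
         // (b == a for squaring); inv is expected to be the negated inverse of n
         // modulo 2^64 (the debug build below asserts inv * n[0] == -1).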
3661     address generate_multiply() {
3662       Label argh, nothing;
3663       bind(argh);
3664       stop("MontgomeryMultiply total_allocation must be <= 8192");
3665 
3666       align(CodeEntryAlignment);
3667       address entry = pc();
3668 
3669       beqz(Rlen, nothing);
3670 
3671       enter();
3672 
3673       // Make room.
3674       mv(Ra, 512);
3675       bgt(Rlen, Ra, argh);
3676       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3677       sub(Ra, sp, Ra);
3678       andi(sp, Ra, -2 * wordSize);
3679 
3680       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3681 
3682       {
3683         // Copy input args, reversing as we go.  We use Ra as a
3684         // temporary variable.
3685         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3686         if (!_squaring)
3687           reverse(Ra, Pb_base, Rlen, Ri, Rj);
3688         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3689       }
3690 
3691       // Push all call-saved registers and also Pm_base which we'll need
3692       // at the end.
3693       save_regs();
3694 
3695 #ifndef PRODUCT
3696       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3697       {
3698         ld(Rn, Address(Pn_base));
3699         mul(Rlo_mn, Rn, inv);
3700         mv(t0, -1);
3701         Label ok;
3702         beq(Rlo_mn, t0, ok);
3703         stop("broken inverse in Montgomery multiply");
3704         bind(ok);
3705       }
3706 #endif
3707 
3708       mv(Pm_base, Ra);
3709 
3710       mv(tmp0, zr);
3711       mv(tmp1, zr);
3712       mv(tmp2, zr);
3713 
3714       block_comment("for (int i = 0; i < len; i++) {");
3715       mv(Ri, zr); {
3716         Label loop, end;
3717         bge(Ri, Rlen, end);
3718 
3719         bind(loop);
3720         pre1(Ri);
3721 
3722         block_comment("  for (j = i; j; j--) {"); {
3723           mv(Rj, Ri);
3724           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3725         } block_comment("  } // j");
3726 
3727         post1();
3728         addw(Ri, Ri, 1);
3729         blt(Ri, Rlen, loop);
3730         bind(end);
3731         block_comment("} // i");
3732       }
3733 
3734       block_comment("for (int i = len; i < 2*len; i++) {");
3735       mv(Ri, Rlen); {
3736         Label loop, end;
3737         slli(t0, Rlen, 1);
3738         bge(Ri, t0, end);
3739 
3740         bind(loop);
3741         pre2(Ri, Rlen);
3742 
3743         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3744           slliw(Rj, Rlen, 1);
3745           subw(Rj, Rj, Ri);
3746           subw(Rj, Rj, 1);
3747           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3748         } block_comment("  } // j");
3749 
3750         post2(Ri, Rlen);
3751         addw(Ri, Ri, 1);
3752         slli(t0, Rlen, 1);
3753         blt(Ri, t0, loop);
3754         bind(end);
3755       }
3756       block_comment("} // i");
3757 
3758       normalize(Rlen);
3759 
3760       mv(Ra, Pm_base);  // Save Pm_base in Ra
3761       restore_regs();  // Restore caller's Pm_base
3762 
3763       // Copy our result into caller's Pm_base
3764       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3765 
3766       leave();
3767       bind(nothing);
3768       ret();
3769 
3770       return entry;
3771     }
3772 
3773     /**
3774      *
3775      * Arguments:
3776      *
3777      * Inputs:
3778      *   c_rarg0   - int array elements a
3779      *   c_rarg1   - int array elements n (the modulus)
3780      *   c_rarg2   - int length
3781      *   c_rarg3   - int inv
3782      *   c_rarg4   - int array elements m (the result)
3783      *
3784      */
3785     address generate_square() {
3786       Label argh;
3787       bind(argh);
3788       stop("MontgomeryMultiply total_allocation must be <= 8192");
3789 
3790       align(CodeEntryAlignment);
3791       address entry = pc();
3792 
3793       enter();
3794 
3795       // Make room.
3796       mv(Ra, 512);
3797       bgt(Rlen, Ra, argh);
3798       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3799       sub(Ra, sp, Ra);
3800       andi(sp, Ra, -2 * wordSize);
3801 
3802       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3803 
3804       {
3805         // Copy input args, reversing as we go.  We use Ra as a
3806         // temporary variable.
3807         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3808         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3809       }
3810 
3811       // Push all call-saved registers and also Pm_base which we'll need
3812       // at the end.
3813       save_regs();
3814 
3815       mv(Pm_base, Ra);
3816 
3817       mv(tmp0, zr);
3818       mv(tmp1, zr);
3819       mv(tmp2, zr);
3820 
3821       block_comment("for (int i = 0; i < len; i++) {");
3822       mv(Ri, zr); {
3823         Label loop, end;
3824         bind(loop);
3825         bge(Ri, Rlen, end);
3826 
3827         pre1(Ri);
3828 
3829         block_comment("for (j = (i+1)/2; j; j--) {"); {
3830           addi(Rj, Ri, 1);
3831           srliw(Rj, Rj, 1);
3832           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3833         } block_comment("  } // j");
3834 
3835         last_squaring(Ri);
3836 
3837         block_comment("  for (j = i/2; j; j--) {"); {
3838           srliw(Rj, Ri, 1);
3839           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3840         } block_comment("  } // j");
3841 
3842         post1_squaring();
3843         addi(Ri, Ri, 1);
3844         blt(Ri, Rlen, loop);
3845 
3846         bind(end);
3847         block_comment("} // i");
3848       }
3849 
3850       block_comment("for (int i = len; i < 2*len; i++) {");
3851       mv(Ri, Rlen); {
3852         Label loop, end;
3853         bind(loop);
3854         slli(t0, Rlen, 1);
3855         bge(Ri, t0, end);
3856 
3857         pre2(Ri, Rlen);
3858 
3859         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
3860           slli(Rj, Rlen, 1);
3861           sub(Rj, Rj, Ri);
3862           sub(Rj, Rj, 1);
3863           srliw(Rj, Rj, 1);
3864           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3865         } block_comment("  } // j");
3866 
3867         last_squaring(Ri);
3868 
3869         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
3870           slli(Rj, Rlen, 1);
3871           sub(Rj, Rj, Ri);
3872           srliw(Rj, Rj, 1);
3873           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3874         } block_comment("  } // j");
3875 
3876         post2(Ri, Rlen);
3877         addi(Ri, Ri, 1);
3878         slli(t0, Rlen, 1);
3879         blt(Ri, t0, loop);
3880 
3881         bind(end);
3882         block_comment("} // i");
3883       }
3884 
3885       normalize(Rlen);
3886 
3887       mv(Ra, Pm_base);  // Save Pm_base in Ra
3888       restore_regs();  // Restore caller's Pm_base
3889 
3890       // Copy our result into caller's Pm_base
3891       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3892 
3893       leave();
3894       ret();
3895 
3896       return entry;
3897     }
3898   };
3899 
3900 #endif // COMPILER2
3901 
3902   address generate_cont_thaw(Continuation::thaw_kind kind) {
3903     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
3904     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
3905 
3906     address start = __ pc();
3907 
3908     if (return_barrier) {
3909       __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
3910     }
3911 
3912 #ifndef PRODUCT
3913     {
3914       Label OK;
3915       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
3916       __ beq(sp, t0, OK);
3917       __ stop("incorrect sp");
3918       __ bind(OK);
3919     }
3920 #endif
3921 
3922     if (return_barrier) {
3923       // preserve possible return value from a method returning to the return barrier
3924       __ sub(sp, sp, 2 * wordSize);
3925       __ fsd(f10, Address(sp, 0 * wordSize));
3926       __ sd(x10, Address(sp, 1 * wordSize));
3927     }
3928 
3929     __ mv(c_rarg1, (return_barrier ? 1 : 0));
3930     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
3931     __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames
3932 
3933     if (return_barrier) {
3934       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
3935       __ ld(x10, Address(sp, 1 * wordSize));
3936       __ fld(f10, Address(sp, 0 * wordSize));
3937       __ add(sp, sp, 2 * wordSize);
3938     }
3939 
3940 #ifndef PRODUCT
3941     {
3942       Label OK;
3943       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
3944       __ beq(sp, t0, OK);
3945       __ stop("incorrect sp");
3946       __ bind(OK);
3947     }
3948 #endif
3949 
3950     Label thaw_success;
3951     // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
3952     __ bnez(t1, thaw_success);
3953     __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
3954     __ bind(thaw_success);
3955 
3956     // make room for the thawed frames
3957     __ sub(t0, sp, t1);
3958     __ andi(sp, t0, -16); // align
3959 
3960     if (return_barrier) {
3961       // save original return value -- again
3962       __ sub(sp, sp, 2 * wordSize);
3963       __ fsd(f10, Address(sp, 0 * wordSize));
3964       __ sd(x10, Address(sp, 1 * wordSize));
3965     }
3966 
3967     // If we want, we can templatize thaw by kind, and have three different entries
3968     __ mv(c_rarg1, kind);
3969 
3970     __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
3971     __ mv(t1, x10); // x10 is the sp of the yielding frame
3972 
3973     if (return_barrier) {
3974       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
3975       __ ld(x10, Address(sp, 1 * wordSize));
3976       __ fld(f10, Address(sp, 0 * wordSize));
3977       __ add(sp, sp, 2 * wordSize);
3978     } else {
3979       __ mv(x10, zr); // return 0 (success) from doYield
3980     }
3981 
3982     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
3983     __ mv(fp, t1);
3984     __ sub(sp, t1, 2 * wordSize); // now pointing to fp spill
3985 
3986     if (return_barrier_exception) {
3987       __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
3988       __ verify_oop(x10);
3989       __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9
3990 
3991       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);
3992 
3993       // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc
3994 
3995       __ mv(x11, x10); // the exception handler
3996       __ mv(x10, x9); // restore return value containing the exception oop
3997       __ verify_oop(x10);
3998 
3999       __ leave();
4000       __ mv(x13, ra);
4001       __ jr(x11); // the exception handler
4002     } else {
4003       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
4004       __ leave();
4005       __ ret();
4006     }
4007 
4008     return start;
4009   }
4010 
4011   address generate_cont_thaw() {
4012     if (!Continuations::enabled()) return nullptr;
4013 
4014     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
4015     address start = __ pc();
4016     generate_cont_thaw(Continuation::thaw_top);
4017     return start;
4018   }
4019 
4020   address generate_cont_returnBarrier() {
4021     if (!Continuations::enabled()) return nullptr;
4022 
4023     // TODO: will probably need multiple return barriers depending on return type
4024     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
4025     address start = __ pc();
4026 
4027     generate_cont_thaw(Continuation::thaw_return_barrier);
4028 
4029     return start;
4030   }
4031 
4032   address generate_cont_returnBarrier_exception() {
4033     if (!Continuations::enabled()) return nullptr;
4034 
4035     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
4036     address start = __ pc();
4037 
4038     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
4039 
4040     return start;
4041   }
4042 
4043 #if COMPILER2_OR_JVMCI
4044 
4045 #undef __
4046 #define __ this->
4047 
4048   class Sha2Generator : public MacroAssembler {
4049     StubCodeGenerator* _cgen;
4050    public:
4051       Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
4052       address generate_sha256_implCompress(bool multi_block) {
4053         return generate_sha2_implCompress(Assembler::e32, multi_block);
4054       }
4055       address generate_sha512_implCompress(bool multi_block) {
4056         return generate_sha2_implCompress(Assembler::e64, multi_block);
4057       }
4058    private:
4059 
4060     void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4061       if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
4062       else                            __ vle64_v(vr, sr);
4063     }
4064 
4065     void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4066       if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
4067       else                            __ vse64_v(vr, sr);
4068     }
4069 
4070     // Overview of the logic in each "quad round".
4071     //
4072     // The code below repeats 16/20 times the logic implementing four rounds
4073     // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds"
4074     // together implement the 64/80 single rounds.
4075     //
4076     //    // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
4077     //    // Output:
4078     //    //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4079     //    vl1reXX.v vTmp1, ofs
4080     //
4081     //    // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
4082     //    addi ofs, ofs, 16/32
4083     //
4084     //    // Add constants to message schedule words:
4085     //    //  Input
4086     //    //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4087     //    //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
4088     //    //  Output
4089     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4090     //    vadd.vv vTmp0, vTmp1, vW0
4091     //
4092     //    //  2 rounds of working variables updates.
4093     //    //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
4094     //    //  Input:
4095     //    //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
4096     //    //    vState0 = {a[t],b[t],e[t],f[t]}
4097     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4098     //    //  Output:
4099     //    //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
4100     //    //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
4101     //    vsha2cl.vv vState1, vState0, vTmp0
4102     //
4103     //    //  2 rounds of working variables updates.
4104     //    //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
4105     //    //  Input
4106     //    //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
4107     //    //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
4108     //    //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
4109     //    //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4110     //    //  Output:
4111     //    //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
4112     //    vsha2ch.vv vState0, vState1, vTmp0
4113     //
4114     //    // Combine 2QW into 1QW
4115     //    //
4116     //    // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
4117     //    //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
4118     //    // and it can only take 3 vectors as inputs. Hence we need to combine
4119     //    // vW1[0] and vW2[1..3] in a single vector.
4120     //    //
4121     //    // vmerge Vt4, Vt1, Vt2, V0
4122     //    // Input
4123     //    //  V0 = mask // first word from vW2, 1..3 words from vW1
4124     //    //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
4125     //    //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
4126     //    // Output
4127     //    //  Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
4128     //    vmerge.vvm vTmp0, vW2, vW1, v0
4129     //
4130     //    // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
4131     //    // Input
4132     //    //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
4133     //    //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
4134     //    //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}     W[11: 9,4]
4135     //    // Output (next four message schedule words)
4136     //    //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
4137     //    vsha2ms.vv vW0, vTmp0, vW3
4138     //
4139     // BEFORE
4140     //  vW0 - vW3 hold the message schedule words (initially the block words)
4141     //    vW0 = W[ 3: 0]   "oldest"
4142     //    vW1 = W[ 7: 4]
4143     //    vW2 = W[11: 8]
4144     //    vW3 = W[15:12]   "newest"
4145     //
4146     //  vState0 - vState1 hold the working state variables
4147     //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
4148     //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
4149     //
4150     // AFTER
4151     //  vW0 - vW3 hold the message schedule words (initially the block words)
4152     //    vW1 = W[ 7: 4]   "oldest"
4153     //    vW2 = W[11: 8]
4154     //    vW3 = W[15:12]
4155     //    vW0 = W[19:16]   "newest"
4156     //
4157     //  vState0 and vState1 hold the working state variables
4158     //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
4159     //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
4160     //
4161     //  The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
4162     //  hence the uses of those vectors rotate in each round, and we get back to the
4163     //  initial configuration every 4 quad-rounds. We could avoid those changes at
4164     //  the cost of moving those vectors at the end of each quad-rounds.
4165     void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
4166                          Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
4167                          bool gen_words = true, bool step_const = true) {
4168       __ vleXX_v(vset_sew, vtemp, scalarconst);
4169       if (step_const) {
4170         __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
4171       }
4172       __ vadd_vv(vtemp2, vtemp, rot1);
4173       __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
4174       __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4175       if (gen_words) {
4176         __ vmerge_vvm(vtemp2, rot3, rot2);
4177         __ vsha2ms_vv(rot1, vtemp2, rot4);
4178       }
4179     }
4180 
4181     const char* stub_name(Assembler::SEW vset_sew, bool multi_block) {
4182       if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress";
4183       if (vset_sew == Assembler::e32 &&  multi_block) return "sha256_implCompressMB";
4184       if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress";
4185       if (vset_sew == Assembler::e64 &&  multi_block) return "sha512_implCompressMB";
4186       ShouldNotReachHere();
4187       return "bad name lookup";
4188     }
4189 
4190     // Arguments:
4191     //
4192     // Inputs:
4193     //   c_rarg0   - byte[]  source+offset
4194     //   c_rarg1   - int[]   SHA.state
4195     //   c_rarg2   - int     offset
4196     //   c_rarg3   - int     limit
4197     //
4198     address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) {
4199       alignas(64) static const uint32_t round_consts_256[64] = {
4200         0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
4201         0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
4202         0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
4203         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
4204         0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
4205         0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
4206         0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
4207         0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
4208         0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
4209         0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
4210         0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
4211         0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
4212         0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
4213         0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
4214         0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
4215         0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
4216       };
4217       alignas(64) static const uint64_t round_consts_512[80] = {
4218         0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
4219         0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
4220         0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
4221         0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
4222         0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
4223         0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
4224         0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
4225         0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
4226         0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
4227         0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
4228         0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
4229         0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
4230         0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
4231         0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
4232         0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
4233         0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
4234         0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
4235         0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
4236         0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
4237         0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
4238         0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
4239         0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
4240         0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
4241         0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
4242         0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
4243         0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
4244         0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
4245       };
4246       const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
4247 
4248       __ align(CodeEntryAlignment);
4249       StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block));
4250       address start = __ pc();
4251 
4252       Register buf   = c_rarg0;
4253       Register state = c_rarg1;
4254       Register ofs   = c_rarg2;
4255       Register limit = c_rarg3;
4256       Register consts =  t2; // caller saved
4257       Register state_c = x28; // caller saved
4258       VectorRegister vindex = v2;
4259       VectorRegister vW0 = v4;
4260       VectorRegister vW1 = v6;
4261       VectorRegister vW2 = v8;
4262       VectorRegister vW3 = v10;
4263       VectorRegister vState0 = v12;
4264       VectorRegister vState1 = v14;
4265       VectorRegister vHash0  = v16;
4266       VectorRegister vHash1  = v18;
4267       VectorRegister vTmp0   = v20;
4268       VectorRegister vTmp1   = v22;
4269 
4270       Label multi_block_loop;
4271 
4272       __ enter();
4273 
4274       address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
4275       la(consts, ExternalAddress(constant_table));
4276 
4277       // Register use in this function:
4278       //
4279       // VECTORS
4280     //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
4281       //             schedule words (Wt). They start with the message block
4282       //             content (W0 to W15), then further words in the message
4283       //             schedule generated via vsha2ms from previous Wt.
4284       //   Initially:
4285       //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
4286       //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
4287       //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
4288       //     vW3 = W[15:12] = {W15, W14, W13, W12}
4289       //
4290       //  vState0 - vState1 hold the working state variables (a, b, ..., h)
4291       //    vState0 = {f[t],e[t],b[t],a[t]}
4292       //    vState1 = {h[t],g[t],d[t],c[t]}
4293       //   Initially:
4294       //    vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
4295       //    vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1}
4296       //
4297       //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
4298       //
4299       //  vTmp0 = temporary, Wt+Kt
4300       //  vTmp1 = temporary, Kt
4301       //
4302       //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
4303       //
4304       // During most of the function the vector state is configured so that each
4305       // vector is interpreted as containing four 32/64-bit (e32/e64) elements (128/256 bits in total).
4306 
4307       // vsha2ch/vsha2cl uses EGW of 4*SEW.
4308       // SHA256 SEW = e32, EGW = 128-bits
4309       // SHA512 SEW = e64, EGW = 256-bits
4310       //
4311       // VLEN is required to be at least 128.
4312       // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
4313       //
4314       // m1/m2: LMUL=1, or 2 for the SHA512/VLEN=128 case above
4315       // ta: tail agnostic (don't care about those lanes)
4316       // ma: mask agnostic (don't care about those lanes)
4317       // x0 is not written; we know the number of vector elements.
4318 
4319       if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
4320         __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
4321       } else {
4322         __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
4323       }
4324 
4325       int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
4326       __ li(t0, indexes);
4327       __ vmv_v_x(vindex, t0);
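           // Viewed as bytes by vluxei8 below, for e32 the index bytes are
           // {0x14, 0x10, 0x04, 0x00} = byte offsets {20, 16, 4, 0} into the int[8] state,
           // i.e. the {f, e, b, a} elements; for e64 they are {0x28, 0x20, 0x08, 0x00}
           // = offsets {40, 32, 8, 0} into the long[8] state, selecting the same elements.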
4328 
4329       // Step over a and b, so that state_c points to c.
4330       // const_add is the byte size of 4 state variables, so const_add/2 skips exactly two of them (a, b).
4331       __ addi(state_c, state, const_add/2);
4332 
4333       // Use index-load to get {f,e,b,a},{h,g,d,c}
4334       __ vluxei8_v(vState0, state, vindex);
4335       __ vluxei8_v(vState1, state_c, vindex);
4336 
4337       __ bind(multi_block_loop);
4338 
4339       // Capture the initial H values in vHash0 and vHash1 to allow for computing
4340       // the resulting H', since H' = H+{a',b',c',...,h'}.
4341       __ vmv_v_v(vHash0, vState0);
4342       __ vmv_v_v(vHash1, vState1);
4343 
4344       // Load the 512/1024-bits of the message block in vW0-vW3 and perform
4345       // an endian swap on each 4/8 bytes element.
4346       //
4347       // If Zvkb is not implemented one can use vrgather
4348       // with an index sequence to byte-swap.
4349       //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
4350       //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
4351       //  this sequence. 'vid' gives us the N.
4352       __ vleXX_v(vset_sew, vW0, buf);
4353       __ vrev8_v(vW0, vW0);
4354       __ addi(buf, buf, const_add);
4355       __ vleXX_v(vset_sew, vW1, buf);
4356       __ vrev8_v(vW1, vW1);
4357       __ addi(buf, buf, const_add);
4358       __ vleXX_v(vset_sew, vW2, buf);
4359       __ vrev8_v(vW2, vW2);
4360       __ addi(buf, buf, const_add);
4361       __ vleXX_v(vset_sew, vW3, buf);
4362       __ vrev8_v(vW3, vW3);
4363       __ addi(buf, buf, const_add);
4364 
4365       // Set v0 up for the vmerge that replaces the first word (idx==0)
4366       __ vid_v(v0);
4367       __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)
4368 
4369       VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
4370       int rot_pos = 0;
4371       // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
4372       const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
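           // That is (qr_end + 4) quad-rounds in total: 16 for SHA-256 (64 rounds) or
           // 20 for SHA-512 (80 rounds); only the first qr_end also extend the message schedule.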
4373       for (int i = 0; i < qr_end; i++) {
4374         sha2_quad_round(vset_sew,
4375                    rotation_regs[(rot_pos + 0) & 0x3],
4376                    rotation_regs[(rot_pos + 1) & 0x3],
4377                    rotation_regs[(rot_pos + 2) & 0x3],
4378                    rotation_regs[(rot_pos + 3) & 0x3],
4379                    consts,
4380                    vTmp1, vTmp0, vState0, vState1);
4381         ++rot_pos;
4382       }
4383       // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
4384       // Note that we stop generating new message schedule words (Wt, vW0-vW3)
4385       // as we already generated all the words we end up consuming (i.e., W[63:60]).
4386       const int qr_c_end = qr_end + 4;
4387       for (int i = qr_end; i < qr_c_end; i++) {
4388         sha2_quad_round(vset_sew,
4389                    rotation_regs[(rot_pos + 0) & 0x3],
4390                    rotation_regs[(rot_pos + 1) & 0x3],
4391                    rotation_regs[(rot_pos + 2) & 0x3],
4392                    rotation_regs[(rot_pos + 3) & 0x3],
4393                    consts,
4394                    vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
4395         ++rot_pos;
4396       }
4397 
4398       //--------------------------------------------------------------------------------
4399       // Compute the updated hash value H'
4400       //   H' = H + {h',g',...,b',a'}
4401       //      = {h,g,...,b,a} + {h',g',...,b',a'}
4402       //      = {h+h',g+g',...,b+b',a+a'}
4403 
4404       // H' = H+{a',b',c',...,h'}
4405       __ vadd_vv(vState0, vHash0, vState0);
4406       __ vadd_vv(vState1, vHash1, vState1);
4407 
4408       if (multi_block) {
4409         int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
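             // The quad-rounds above advanced consts through the round-constant table
             // (const_add bytes per quad-round except the final one: 15 * 16 = 240 for
             // SHA-256, 19 * 32 = 608 for SHA-512); rewind it to the start of the table
             // before processing the next block.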
4410         __ addi(consts, consts, -total_adds);
4411         __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
4412         __ ble(ofs, limit, multi_block_loop);
4413         __ mv(c_rarg0, ofs); // return ofs
4414       }
4415 
4416       // Store H[0..7] = {a,b,c,d,e,f,g,h} from
4417       //  vState0 = {f,e,b,a}
4418       //  vState1 = {h,g,d,c}
4419       __ vsuxei8_v(vState0, state,   vindex);
4420       __ vsuxei8_v(vState1, state_c, vindex);
4421 
4422       __ leave();
4423       __ ret();
4424 
4425       return start;
4426     }
4427   };
4428 
4429 #undef __
4430 #define __ _masm->
4431 
4432   // Set of L registers that correspond to a contiguous memory area.
4433   // Each 64-bit register typically corresponds to 2 32-bit integers.
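       // For example, BufRegCache (= RegCache<8>) below caches the 64-byte MD5 block with
       // 8 x 64-bit loads; add_u32(dest, 5) then adds the 32-bit word at byte offset 20,
       // i.e. the upper half of _regs[2], to dest.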
4434   template <uint L>
4435   class RegCache {
4436   private:
4437     MacroAssembler *_masm;
4438     Register _regs[L];
4439 
4440   public:
4441     RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
4442       assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
4443       auto it = rs.begin();
4444       for (auto &r: _regs) {
4445         r = *it;
4446         ++it;
4447       }
4448     }
4449 
4450     // generate load for the i'th register
4451     void gen_load(uint i, Register base) {
4452       assert(i < L, "invalid i: %u", i);
4453       __ ld(_regs[i], Address(base, 8 * i));
4454     }
4455 
4456     // add i'th 32-bit integer to dest
4457     void add_u32(const Register dest, uint i, const Register rtmp = t0) {
4458       assert(i < 2 * L, "invalid i: %u", i);
4459 
4460       if (is_even(i)) {
4461         // Use the bottom 32 bits. No need to mask off the top 32 bits
4462         // as addw will do the right thing.
4463         __ addw(dest, dest, _regs[i / 2]);
4464       } else {
4465         // Use the top 32 bits by right-shifting them.
4466         __ srli(rtmp, _regs[i / 2], 32);
4467         __ addw(dest, dest, rtmp);
4468       }
4469     }
4470   };
4471 
4472   typedef RegCache<8> BufRegCache;
4473 
4474   // a += value + x + ac;
4475   // a = Integer.rotateLeft(a, s) + b;
4476   void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
4477                                Register a, Register b, Register c, Register d,
4478                                int k, int s, int t,
4479                                Register value) {
4480     // a += ac
4481     __ addw(a, a, t, t1);
4482 
4483     // a += x;
4484     reg_cache.add_u32(a, k);
4485     // a += value;
4486     __ addw(a, a, value);
4487 
4488     // a = Integer.rotateLeft(a, s) + b;
4489     __ rolw_imm(a, a, s);
4490     __ addw(a, a, b);
4491   }
4492 
4493   // a += ((b & c) | ((~b) & d)) + x + ac;
4494   // a = Integer.rotateLeft(a, s) + b;
4495   void md5_FF(BufRegCache& reg_cache,
4496               Register a, Register b, Register c, Register d,
4497               int k, int s, int t,
4498               Register rtmp1, Register rtmp2) {
4499     // rtmp1 = b & c
4500     __ andr(rtmp1, b, c);
4501 
4502     // rtmp2 = (~b) & d
4503     __ andn(rtmp2, d, b);
4504 
4505     // rtmp1 = (b & c) | ((~b) & d)
4506     __ orr(rtmp1, rtmp1, rtmp2);
4507 
4508     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4509   }
4510 
4511   // a += ((b & d) | (c & (~d))) + x + ac;
4512   // a = Integer.rotateLeft(a, s) + b;
4513   void md5_GG(BufRegCache& reg_cache,
4514               Register a, Register b, Register c, Register d,
4515               int k, int s, int t,
4516               Register rtmp1, Register rtmp2) {
4517     // rtmp1 = b & d
4518     __ andr(rtmp1, b, d);
4519 
4520     // rtmp2 = c & (~d)
4521     __ andn(rtmp2, c, d);
4522 
4523     // rtmp1 = (b & d) | (c & (~d))
4524     __ orr(rtmp1, rtmp1, rtmp2);
4525 
4526     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4527   }
4528 
4529   // a += ((b ^ c) ^ d) + x + ac;
4530   // a = Integer.rotateLeft(a, s) + b;
4531   void md5_HH(BufRegCache& reg_cache,
4532               Register a, Register b, Register c, Register d,
4533               int k, int s, int t,
4534               Register rtmp1, Register rtmp2) {
4535     // rtmp1 = (b ^ c) ^ d
4536     __ xorr(rtmp2, b, c);
4537     __ xorr(rtmp1, rtmp2, d);
4538 
4539     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4540   }
4541 
4542   // a += (c ^ (b | (~d))) + x + ac;
4543   // a = Integer.rotateLeft(a, s) + b;
4544   void md5_II(BufRegCache& reg_cache,
4545               Register a, Register b, Register c, Register d,
4546               int k, int s, int t,
4547               Register rtmp1, Register rtmp2) {
4548     // rtmp1 = c ^ (b | (~d))
4549     __ orn(rtmp2, b, d);
4550     __ xorr(rtmp1, c, rtmp2);
4551 
4552     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4553   }
4554 
4555   // Arguments:
4556   //
4557   // Inputs:
4558   //   c_rarg0   - byte[]  source+offset
4559   //   c_rarg1   - int[]   SHA.state
4560   //   c_rarg2   - int     offset  (multi_block == True)
4561   //   c_rarg3   - int     limit   (multi_block == True)
4562   //
4563   // Registers:
4564   //    x0   zero  (zero)
4565   //    x1     ra  (return address)
4566   //    x2     sp  (stack pointer)
4567   //    x3     gp  (global pointer)
4568   //    x4     tp  (thread pointer)
4569   //    x5     t0  (tmp register)
4570   //    x6     t1  (tmp register)
4571   //    x7     t2  state0
4572   //    x8  fp/s0  (frame pointer)
4573   //    x9     s1
4574   //   x10     a0  rtmp1 / c_rarg0
4575   //   x11     a1  rtmp2 / c_rarg1
4576   //   x12     a2  a     / c_rarg2
4577   //   x13     a3  b     / c_rarg3
4578   //   x14     a4  c
4579   //   x15     a5  d
4580   //   x16     a6  buf
4581   //   x17     a7  state
4582   //   x18     s2  ofs     [saved-reg]  (multi_block == True)
4583   //   x19     s3  limit   [saved-reg]  (multi_block == True)
4584   //   x20     s4  state1  [saved-reg]
4585   //   x21     s5  state2  [saved-reg]
4586   //   x22     s6  state3  [saved-reg]
4587   //   x23     s7
4588   //   x24     s8  buf0    [saved-reg]
4589   //   x25     s9  buf1    [saved-reg]
4590   //   x26    s10  buf2    [saved-reg]
4591   //   x27    s11  buf3    [saved-reg]
4592   //   x28     t3  buf4
4593   //   x29     t4  buf5
4594   //   x30     t5  buf6
4595   //   x31     t6  buf7
4596   address generate_md5_implCompress(bool multi_block, const char *name) {
4597     __ align(CodeEntryAlignment);
4598     StubCodeMark mark(this, "StubRoutines", name);
4599     address start = __ pc();
4600 
4601     // rotation constants
4602     const int S11 = 7;
4603     const int S12 = 12;
4604     const int S13 = 17;
4605     const int S14 = 22;
4606     const int S21 = 5;
4607     const int S22 = 9;
4608     const int S23 = 14;
4609     const int S24 = 20;
4610     const int S31 = 4;
4611     const int S32 = 11;
4612     const int S33 = 16;
4613     const int S34 = 23;
4614     const int S41 = 6;
4615     const int S42 = 10;
4616     const int S43 = 15;
4617     const int S44 = 21;
4618 
4619     const int64_t mask32 = 0xffffffff;
4620 
4621     Register buf_arg   = c_rarg0; // a0
4622     Register state_arg = c_rarg1; // a1
4623     Register ofs_arg   = c_rarg2; // a2
4624     Register limit_arg = c_rarg3; // a3
4625 
4626     // we'll copy the args to these registers to free up a0-a3
4627     // to use for other values manipulated by instructions
4628     // that can be compressed
4629     Register buf       = x16; // a6
4630     Register state     = x17; // a7
4631     Register ofs       = x18; // s2
4632     Register limit     = x19; // s3
4633 
4634     // using x12->x15 to allow compressed instructions
4635     Register a         = x12; // a2
4636     Register b         = x13; // a3
4637     Register c         = x14; // a4
4638     Register d         = x15; // a5
4639 
4640     Register state0    =  x7; // t2
4641     Register state1    = x20; // s4
4642     Register state2    = x21; // s5
4643     Register state3    = x22; // s6
4644 
4645     // using x10->x11 to allow compressed instructions
4646     Register rtmp1     = x10; // a0
4647     Register rtmp2     = x11; // a1
4648 
4649     RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
4650     RegSet reg_cache_regs;
4651     reg_cache_regs += reg_cache_saved_regs;
4652     reg_cache_regs += RegSet::of(t3, t4, t5, t6);
4653     BufRegCache reg_cache(_masm, reg_cache_regs);
4654 
4655     RegSet saved_regs;
4656     if (multi_block) {
4657       saved_regs += RegSet::of(ofs, limit);
4658     }
4659     saved_regs += RegSet::of(state1, state2, state3);
4660     saved_regs += reg_cache_saved_regs;
4661 
4662     __ push_reg(saved_regs, sp);
4663 
4664     __ mv(buf, buf_arg);
4665     __ mv(state, state_arg);
4666     if (multi_block) {
4667       __ mv(ofs, ofs_arg);
4668       __ mv(limit, limit_arg);
4669     }
4670 
4671     // To minimize the number of memory operations:
4672     // read the four 4-byte state values in pairs, each with a single ld,
4673     // and split them into 2 registers.
4674     //
4675     // Since the core MD5 algorithm works on 32-bit words, the code
4676     // below never looks at the content of the upper 32 bits of
4677     // state[x]. Based on this observation we can apply a further
4678     // optimization: simply ignore the upper 32 bits of state0/state2,
4679     // rather than zeroing the upper 32 bits of state0/state2
4680     // explicitly with extra instructions.
4681     __ ld(state0, Address(state));
4682     __ srli(state1, state0, 32);
4683     __ ld(state2, Address(state, 8));
4684     __ srli(state3, state2, 32);
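         // Now: state0 = state[1]:state[0] (A in its low 32 bits), state1 = B,
         //      state2 = state[3]:state[2] (C in its low 32 bits), state3 = D.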
4685 
4686     Label md5_loop;
4687     __ BIND(md5_loop);
4688 
4689     __ mv(a, state0);
4690     __ mv(b, state1);
4691     __ mv(c, state2);
4692     __ mv(d, state3);
4693 
4694     // Round 1
4695     reg_cache.gen_load(0, buf);
4696     md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
4697     md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
4698     reg_cache.gen_load(1, buf);
4699     md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
4700     md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
4701     reg_cache.gen_load(2, buf);
4702     md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
4703     md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
4704     reg_cache.gen_load(3, buf);
4705     md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
4706     md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
4707     reg_cache.gen_load(4, buf);
4708     md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
4709     md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
4710     reg_cache.gen_load(5, buf);
4711     md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
4712     md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
4713     reg_cache.gen_load(6, buf);
4714     md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
4715     md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
4716     reg_cache.gen_load(7, buf);
4717     md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
4718     md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);
4719 
4720     // Round 2
4721     md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
4722     md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
4723     md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
4724     md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
4725     md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
4726     md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
4727     md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
4728     md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
4729     md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
4730     md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
4731     md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
4732     md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
4733     md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
4734     md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
4735     md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
4736     md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);
4737 
4738     // Round 3
4739     md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
4740     md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
4741     md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
4742     md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
4743     md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
4744     md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
4745     md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
4746     md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
4747     md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
4748     md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
4749     md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
4750     md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
4751     md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
4752     md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
4753     md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
4754     md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);
4755 
4756     // Round 4
4757     md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
4758     md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
4759     md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
4760     md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
4761     md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
4762     md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
4763     md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
4764     md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
4765     md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
4766     md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
4767     md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
4768     md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
4769     md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
4770     md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
4771     md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
4772     md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);
4773 
4774     __ addw(state0, state0, a);
4775     __ addw(state1, state1, b);
4776     __ addw(state2, state2, c);
4777     __ addw(state3, state3, d);
4778 
4779     if (multi_block) {
4780       __ addi(buf, buf, 64);
4781       __ addi(ofs, ofs, 64);
4782       // if (ofs <= limit) goto md5_loop
4783       __ bge(limit, ofs, md5_loop);
4784       __ mv(c_rarg0, ofs); // return ofs
4785     }
4786 
4787     // to minimize the number of memory operations:
4788     // write back the 4 state 4-byte values in pairs, with a single sd
4789     __ mv(t0, mask32);
4790     __ andr(state0, state0, t0);
4791     __ slli(state1, state1, 32);
4792     __ orr(state0, state0, state1);
4793     __ sd(state0, Address(state));
4794     __ andr(state2, state2, t0);
4795     __ slli(state3, state3, 32);
4796     __ orr(state2, state2, state3);
4797     __ sd(state2, Address(state, 8));
4798 
4799     __ pop_reg(saved_regs, sp);
4800     __ ret();
4801 
4802     return (address) start;
4803   }
4804 
4805   /**
4806    * Perform the quarter round calculations on values contained within four vector registers.
4807    *
4808    * @param aVec the SIMD register containing only the "a" values
4809    * @param bVec the SIMD register containing only the "b" values
4810    * @param cVec the SIMD register containing only the "c" values
4811    * @param dVec the SIMD register containing only the "d" values
4812    * @param tmp_vr temporary vector register that holds intermediate values.
4813    */
4814   void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
4815                           VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
4816     // a += b, d ^= a, d <<<= 16
4817     __ vadd_vv(aVec, aVec, bVec);
4818     __ vxor_vv(dVec, dVec, aVec);
4819     __ vrole32_vi(dVec, 16, tmp_vr);
4820 
4821     // c += d, b ^= c, b <<<= 12
4822     __ vadd_vv(cVec, cVec, dVec);
4823     __ vxor_vv(bVec, bVec, cVec);
4824     __ vrole32_vi(bVec, 12, tmp_vr);
4825 
4826     // a += b, d ^= a, d <<<= 8
4827     __ vadd_vv(aVec, aVec, bVec);
4828     __ vxor_vv(dVec, dVec, aVec);
4829     __ vrole32_vi(dVec, 8, tmp_vr);
4830 
4831     // c += d, b ^= c, b <<<= 7
4832     __ vadd_vv(cVec, cVec, dVec);
4833     __ vxor_vv(bVec, bVec, cVec);
4834     __ vrole32_vi(bVec, 7, tmp_vr);
4835   }
4836 
4837   /**
4838    * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
4839    *
4840    *  Input arguments:
4841    *  c_rarg0   - state, the starting state
4842    *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
4843    *
4844    *  Implementation Note:
4845    *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
4846    *   N depends on the vector register length (VLEN).
4847    */
4848   address generate_chacha20Block() {
4849     Label L_Rounds;
4850 
4851     __ align(CodeEntryAlignment);
4852     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4853     address start = __ pc();
4854     __ enter();
4855 
4856     const int states_len = 16;
4857     const int step = 4;
4858     const Register state = c_rarg0;
4859     const Register key_stream = c_rarg1;
4860     const Register tmp_addr = t0;
4861     const Register length = t1;
4862 
4863     // Organize vector registers in an array that facilitates
4864     // putting repetitive opcodes into loop structures below.
4865     const VectorRegister work_vrs[16] = {
4866       v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
4867       v8, v9, v10, v11, v12, v13, v14, v15
4868     };
4869     const VectorRegister tmp_vr = v16;
4870     const VectorRegister counter_vr = v17;
4871 
4872     {
4873       // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
4874       // (16 blocks x 64 bytes each) at the Java level.
4875       __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
4876     }
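         // 'length' now holds the granted vl, i.e. the number of 64-byte ChaCha20 blocks
         // generated in parallel (e.g. 4 when VLEN == 128).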
4877 
4878     // Load from source state.
4879     // Every element in source state is duplicated to all elements in the corresponding vector.
4880     __ mv(tmp_addr, state);
4881     for (int i = 0; i < states_len; i += 1) {
4882       __ vlse32_v(work_vrs[i], tmp_addr, zr);
4883       __ addi(tmp_addr, tmp_addr, step);
4884     }
4885     // Adjust counter for every individual block.
4886     __ vid_v(counter_vr);
4887     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
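         // e.g. with 4 lanes, lane j now computes the block whose counter is state[12] + j.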
4888 
4889     // Perform 10 iterations of the 8 quarter round set
4890     {
4891       const Register loop = t2; // share t2 with other non-overlapping usages.
4892       __ mv(loop, 10);
4893       __ BIND(L_Rounds);
4894 
4895       chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
4896       chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
4897       chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
4898       chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
4899 
4900       chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
4901       chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
4902       chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
4903       chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
4904 
4905       __ sub(loop, loop, 1);
4906       __ bnez(loop, L_Rounds);
4907     }
4908 
4909     // Add the original state into the final working state.
4910     // We do this by first duplicating every element of the source state array into the
4911     // corresponding vector, then adding it to the post-loop working state.
4912     __ mv(tmp_addr, state);
4913     for (int i = 0; i < states_len; i += 1) {
4914       __ vlse32_v(tmp_vr, tmp_addr, zr);
4915       __ addi(tmp_addr, tmp_addr, step);
4916       __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
4917     }
4918     // Add the counter overlay onto work_vrs[12] at the end.
4919     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
4920 
4921     // Store result to key stream.
4922     {
4923       const Register stride = t2; // share t2 with other non-overlapping usages.
4924       // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
4925       __ mv(stride, 64);
4926       for (int i = 0; i < states_len; i += 1) {
4927         __ vsse32_v(work_vrs[i], key_stream, stride);
4928         __ addi(key_stream, key_stream, step);
4929       }
4930     }
4931 
4932     // Return length of output key_stream
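         // (vl blocks x 64 bytes per block, hence the shift by 6)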
4933     __ slli(c_rarg0, length, 6);
4934 
4935     __ leave();
4936     __ ret();
4937 
4938     return (address) start;
4939   }
4940 
4941 
4942   // ------------------------ SHA-1 intrinsic ------------------------
4943 
4944   // K't =
4945   //    5a827999, 0  <= t <= 19
4946   //    6ed9eba1, 20 <= t <= 39
4947   //    8f1bbcdc, 40 <= t <= 59
4948   //    ca62c1d6, 60 <= t <= 79
4949   void sha1_prepare_k(Register cur_k, int round) {
4950     assert(round >= 0 && round < 80, "must be");
4951 
4952     static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
4953     if ((round % 20) == 0) {
4954       __ mv(cur_k, ks[round/20]);
4955     }
4956   }
4957 
4958   // W't =
4959   //    M't,                                      0 <=  t <= 15
4960   //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4961   void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
4962     assert(round >= 0 && round < 80, "must be");
4963 
4964     if (round < 16) {
4965       // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
4966       //   in ws[0], high part contains W't-0, low part contains W't-1,
4967       //   in ws[1], high part contains W't-2, low part contains W't-3,
4968       //   ...
4969       //   in ws[7], high part contains W't-14, low part contains W't-15.
4970 
4971       if ((round % 2) == 0) {
4972         __ ld(ws[round/2], Address(buf, (round/2) * 8));
4973         // reverse bytes, as SHA-1 is defined in big-endian.
4974         __ revb(ws[round/2], ws[round/2]);
4975         __ srli(cur_w, ws[round/2], 32);
4976       } else {
4977         __ mv(cur_w, ws[round/2]);
4978       }
4979 
4980       return;
4981     }
4982 
4983     if ((round % 2) == 0) {
4984       int idx = 16;
4985       // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4986       __ srli(t1, ws[(idx-8)/2], 32);
4987       __ xorr(t0, ws[(idx-3)/2], t1);
4988 
4989       __ srli(t1, ws[(idx-14)/2], 32);
4990       __ srli(cur_w, ws[(idx-16)/2], 32);
4991       __ xorr(cur_w, cur_w, t1);
4992 
4993       __ xorr(cur_w, cur_w, t0);
4994       __ rolw_imm(cur_w, cur_w, 1, t0);
4995 
4996       // copy the cur_w value to ws[8].
4997       // now, valid w't values are at:
4998       //  w0:       ws[0]'s lower 32 bits
4999       //  w1 ~ w14: ws[1] ~ ws[7]
5000       //  w15:      ws[8]'s higher 32 bits
5001       __ slli(ws[idx/2], cur_w, 32);
5002 
5003       return;
5004     }
5005 
5006     int idx = 17;
5007     // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5008     __ srli(t1, ws[(idx-3)/2], 32);
5009     __ xorr(t0, t1, ws[(idx-8)/2]);
5010 
5011     __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
5012 
5013     __ xorr(cur_w, cur_w, t0);
5014     __ rolw_imm(cur_w, cur_w, 1, t0);
5015 
5016     // copy the cur_w value to ws[8]
5017     __ zero_extend(cur_w, cur_w, 32);
5018     __ orr(ws[idx/2], ws[idx/2], cur_w);
5019 
5020     // shift the w't registers, so they start from ws[0] again.
5021     // now, valid w't values are at:
5022     //  w0 ~ w15: ws[0] ~ ws[7]
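         // Note: this "shift" happens at stub-generation (C++) time only - it renames which
         // physical register holds which w't and emits no RISC-V instructions.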
5023     Register ws_0 = ws[0];
5024     for (int i = 0; i < 16/2; i++) {
5025       ws[i] = ws[i+1];
5026     }
5027     ws[8] = ws_0;
5028   }
5029 
5030   // f't(x, y, z) =
5031   //    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
5032   //    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
5033   //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
5034   //    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
5035   void sha1_f(Register dst, Register x, Register y, Register z, int round) {
5036     assert(round >= 0 && round < 80, "must be");
5037     assert_different_registers(dst, x, y, z, t0, t1);
5038 
5039     if (round < 20) {
5040       // (x & y) ^ (~x & z)
5041       __ andr(t0, x, y);
5042       __ andn(dst, z, x);
5043       __ xorr(dst, dst, t0);
5044     } else if (round >= 40 && round < 60) {
5045       // (x & y) ^ (x & z) ^ (y & z)
5046       __ andr(t0, x, y);
5047       __ andr(t1, x, z);
5048       __ andr(dst, y, z);
5049       __ xorr(dst, dst, t0);
5050       __ xorr(dst, dst, t1);
5051     } else {
5052       // x ^ y ^ z
5053       __ xorr(dst, x, y);
5054       __ xorr(dst, dst, z);
5055     }
5056   }
5057 
5058   // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5059   // e = d
5060   // d = c
5061   // c = ROTL'30(b)
5062   // b = a
5063   // a = T
5064   void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
5065                           Register cur_k, Register cur_w, Register tmp, int round) {
5066     assert(round >= 0 && round < 80, "must be");
5067     assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
5068 
5069     // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5070 
5071     // cur_w will be recalculated at the beginning of each round,
5072     // so, we can reuse it as a temp register here.
5073     Register tmp2 = cur_w;
5074 
5075     // reuse e as a temporary register, as we will mv new value into it later
5076     Register tmp3 = e;
5077     __ add(tmp2, cur_k, tmp2);
5078     __ add(tmp3, tmp3, tmp2);
5079     __ rolw_imm(tmp2, a, 5, t0);
5080 
5081     sha1_f(tmp, b, c, d, round);
5082 
5083     __ add(tmp2, tmp2, tmp);
5084     __ add(tmp2, tmp2, tmp3);
5085 
5086     // e = d
5087     // d = c
5088     // c = ROTL'30(b)
5089     // b = a
5090     // a = T
5091     __ mv(e, d);
5092     __ mv(d, c);
5093 
5094     __ rolw_imm(c, b, 30);
5095     __ mv(b, a);
5096     __ mv(a, tmp2);
5097   }
5098 
5099   // H(i)0 = a + H(i-1)0
5100   // H(i)1 = b + H(i-1)1
5101   // H(i)2 = c + H(i-1)2
5102   // H(i)3 = d + H(i-1)3
5103   // H(i)4 = e + H(i-1)4
5104   void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
5105                               Register prev_ab, Register prev_cd, Register prev_e) {
5106     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5107 
5108     __ add(a, a, prev_ab);
5109     __ srli(prev_ab, prev_ab, 32);
5110     __ add(b, b, prev_ab);
5111 
5112     __ add(c, c, prev_cd);
5113     __ srli(prev_cd, prev_cd, 32);
5114     __ add(d, d, prev_cd);
5115 
5116     __ add(e, e, prev_e);
5117   }
5118 
5119   void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
5120                                 Register prev_ab, Register prev_cd, Register prev_e) {
5121     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
5122 
5123     __ slli(t0, b, 32);
5124     __ zero_extend(prev_ab, a, 32);
5125     __ orr(prev_ab, prev_ab, t0);
5126 
5127     __ slli(t0, d, 32);
5128     __ zero_extend(prev_cd, c, 32);
5129     __ orr(prev_cd, prev_cd, t0);
5130 
5131     __ mv(prev_e, e);
5132   }
5133 
5134   // Intrinsic for:
5135   //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5136   //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5137   //
5138   // Arguments:
5139   //
5140   // Inputs:
5141   //   c_rarg0: byte[]  src array + offset
5142   //   c_rarg1: int[]   SHA.state
5143   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5144   //   c_rarg2: int     offset
5145   //   c_rarg3: int     limit
5146   //
5147   // Outputs:
5148   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5149   //   c_rarg0: int offset, when (multi_block == true)
5150   //
5151   address generate_sha1_implCompress(bool multi_block, const char *name) {
5152     __ align(CodeEntryAlignment);
5153     StubCodeMark mark(this, "StubRoutines", name);
5154 
5155     address start = __ pc();
5156     __ enter();
5157 
5158     RegSet saved_regs = RegSet::range(x18, x27);
5159     if (multi_block) {
5160       // use x9 as src below.
5161       saved_regs += RegSet::of(x9);
5162     }
5163     __ push_reg(saved_regs, sp);
5164 
5165     // c_rarg0 - c_rarg3: x10 - x13
5166     Register buf    = c_rarg0;
5167     Register state  = c_rarg1;
5168     Register offset = c_rarg2;
5169     Register limit  = c_rarg3;
5170     // use src to contain the original start point of the array.
5171     Register src    = x9;
5172 
5173     if (multi_block) {
5174       __ sub(limit, limit, offset);
5175       __ add(limit, limit, buf);
5176       __ sub(src, buf, offset);
5177     }
5178 
5179     // [args-reg]:  x14 - x17
5180     // [temp-reg]:  x28 - x31
5181     // [saved-reg]: x18 - x27
5182 
5183     // h0/1/2/3/4
5184     const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5185     // w0, w1, ... w15
5186     // put two adjacent w's in one register:
5187     //    one in the high word part, the other in the low word part.
5188     // Depending on the round (even or odd), a given w't value resides in a different item of ws[].
5189     // w0 ~ w15, either reside in
5190     //    ws[0] ~ ws[7], where
5191     //      w0 at higher 32 bits of ws[0],
5192     //      w1 at lower 32 bits of ws[0],
5193     //      ...
5194     //      w14 at higher 32 bits of ws[7],
5195     //      w15 at lower 32 bits of ws[7].
5196     // or, reside in
5197     //    w0:       ws[0]'s lower 32 bits
5198     //    w1 ~ w14: ws[1] ~ ws[7]
5199     //    w15:      ws[8]'s higher 32 bits
5200     Register ws[9] = {x29, x30, x31, x18,
5201                       x19, x20, x21, x22,
5202                       x23}; // auxiliary register for calculating w's value
5203     // current k't's value
5204     const Register cur_k = x24;
5205     // current w't's value
5206     const Register cur_w = x25;
5207     // values of a, b, c, d, e in the previous round
5208     const Register prev_ab = x26, prev_cd = x27;
5209     const Register prev_e = offset; // reuse offset/c_rarg2
5210 
5211     // load 5 words state into a, b, c, d, e.
5212     //
5213     // To minimize the number of memory operations, we apply the following
5214     // optimization: read the 4-byte state values (a/b/c/d) in pairs,
5215     // each with a single ld, and split them into 2 registers.
5216     //
5217     // Since the core SHA-1 algorithm works on 32-bit words, the code
5218     // below never looks at the content of the upper 32 bits of
5219     // a/b/c/d/e. Based on this observation we can apply a further
5220     // optimization: simply ignore the upper 32 bits of a/c/e,
5221     // rather than zeroing the upper 32 bits of a/c/e explicitly
5222     // with extra instructions.
5223     __ ld(a, Address(state, 0));
5224     __ srli(b, a, 32);
5225     __ ld(c, Address(state, 8));
5226     __ srli(d, c, 32);
5227     __ lw(e, Address(state, 16));
5228 
5229     Label L_sha1_loop;
5230     if (multi_block) {
5231       __ BIND(L_sha1_loop);
5232     }
5233 
5234     sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5235 
5236     for (int round = 0; round < 80; round++) {
5237       // prepare K't value
5238       sha1_prepare_k(cur_k, round);
5239 
5240       // prepare W't value
5241       sha1_prepare_w(cur_w, ws, buf, round);
5242 
5243       // one round process
5244       sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
5245     }
5246 
5247     // compute the intermediate hash value
5248     sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5249 
5250     if (multi_block) {
5251       int64_t block_bytes = 16 * 4;
5252       __ addi(buf, buf, block_bytes);
5253 
5254       __ bge(limit, buf, L_sha1_loop, true);
5255     }
5256 
5257     // store back the state.
5258     __ zero_extend(a, a, 32);
5259     __ slli(b, b, 32);
5260     __ orr(a, a, b);
5261     __ sd(a, Address(state, 0));
5262     __ zero_extend(c, c, 32);
5263     __ slli(d, d, 32);
5264     __ orr(c, c, d);
5265     __ sd(c, Address(state, 8));
5266     __ sw(e, Address(state, 16));
5267 
5268     // return offset
5269     if (multi_block) {
5270       __ sub(c_rarg0, buf, src);
5271     }
5272 
5273     __ pop_reg(saved_regs, sp);
5274 
5275     __ leave();
5276     __ ret();
5277 
5278     return (address) start;
5279   }
5280 
5281   /**
5282    * vector registers:
5283    *   input VectorRegister's:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
5284    *   index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
5285    *   output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
5286    *
5287    * NOTE: each field will occupy a vector register group
5288    */
5289   void base64_vector_encode_round(Register src, Register dst, Register codec,
5290                     Register size, Register stepSrc, Register stepDst,
5291                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
5292                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5293                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
5294                     Assembler::LMUL lmul) {
5295     // set vector register type/len
5296     __ vsetvli(x0, size, Assembler::e8, lmul);
5297 
5298     // segmented load src into v registers: mem(src) => vr(3)
5299     __ vlseg3e8_v(inputV1, src);
5300 
5301     // src = src + register_group_len_bytes * 3
5302     __ add(src, src, stepSrc);
5303 
5304     // encoding
5305     //   1. compute index into lookup table: vr(3) => vr(4)
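         //      For each 3-byte group {b0, b1, b2} the four 6-bit indexes are:
         //        idx1 = b0[7:2], idx2 = b0[1:0]:b1[7:4], idx3 = b1[3:0]:b2[7:6], idx4 = b2[5:0]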
5306     __ vsrl_vi(idxV1, inputV1, 2);
5307 
5308     __ vsrl_vi(idxV2, inputV2, 2);
5309     __ vsll_vi(inputV1, inputV1, 6);
5310     __ vor_vv(idxV2, idxV2, inputV1);
5311     __ vsrl_vi(idxV2, idxV2, 2);
5312 
5313     __ vsrl_vi(idxV3, inputV3, 4);
5314     __ vsll_vi(inputV2, inputV2, 4);
5315     __ vor_vv(idxV3, inputV2, idxV3);
5316     __ vsrl_vi(idxV3, idxV3, 2);
5317 
5318     __ vsll_vi(idxV4, inputV3, 2);
5319     __ vsrl_vi(idxV4, idxV4, 2);
5320 
5321     //   2. indexed load: vr(4) => vr(4)
5322     __ vluxei8_v(outputV1, codec, idxV1);
5323     __ vluxei8_v(outputV2, codec, idxV2);
5324     __ vluxei8_v(outputV3, codec, idxV3);
5325     __ vluxei8_v(outputV4, codec, idxV4);
5326 
5327     // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
5328     __ vsseg4e8_v(outputV1, dst);
5329 
5330     // dst = dst + register_group_len_bytes * 4
5331     __ add(dst, dst, stepDst);
5332   }
5333 
5334   /**
5335    *  void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
5336    *
5337    *  Input arguments:
5338    *  c_rarg0   - src, source array
5339    *  c_rarg1   - sp, src start offset
5340    *  c_rarg2   - sl, src end offset
5341    *  c_rarg3   - dst, dest array
5342    *  c_rarg4   - dp, dst start offset
5343    *  c_rarg5   - isURL, Base64 or URL character set
5344    */
5345   address generate_base64_encodeBlock() {
5346     alignas(64) static const char toBase64[64] = {
5347       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5348       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5349       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5350       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5351       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5352     };
5353 
5354     alignas(64) static const char toBase64URL[64] = {
5355       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5356       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5357       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5358       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5359       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5360     };
5361 
5362     __ align(CodeEntryAlignment);
5363     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5364     address start = __ pc();
5365     __ enter();
5366 
5367     Register src    = c_rarg0;
5368     Register soff   = c_rarg1;
5369     Register send   = c_rarg2;
5370     Register dst    = c_rarg3;
5371     Register doff   = c_rarg4;
5372     Register isURL  = c_rarg5;
5373 
5374     Register codec  = c_rarg6;
5375     Register length = c_rarg7; // total length of src data in bytes
5376 
5377     Label ProcessData, Exit;
5378 
5379     // length should be a multiple of 3
5380     __ sub(length, send, soff);
5381     // real src/dst to process data
5382     __ add(src, src, soff);
5383     __ add(dst, dst, doff);
5384 
5385     // load the codec base address
5386     __ la(codec, ExternalAddress((address) toBase64));
5387     __ beqz(isURL, ProcessData);
5388     __ la(codec, ExternalAddress((address) toBase64URL));
5389     __ BIND(ProcessData);
5390 
5391     // vector version
5392     if (UseRVV) {
5393       Label ProcessM2, ProcessM1, ProcessScalar;
5394 
5395       Register size      = soff;
5396       Register stepSrcM1 = send;
5397       Register stepSrcM2 = doff;
5398       Register stepDst   = isURL;
5399 
5400       __ mv(size, MaxVectorSize * 2);
5401       __ mv(stepSrcM1, MaxVectorSize * 3);
5402       __ slli(stepSrcM2, stepSrcM1, 1);
5403       __ mv(stepDst, MaxVectorSize * 2 * 4);
5404 
5405       __ blt(length, stepSrcM2, ProcessM1);
5406 
5407       __ BIND(ProcessM2);
5408       base64_vector_encode_round(src, dst, codec,
5409                     size, stepSrcM2, stepDst,
5410                     v2, v4, v6,         // inputs
5411                     v8, v10, v12, v14,  // indexes
5412                     v16, v18, v20, v22, // outputs
5413                     Assembler::m2);
5414 
5415       __ sub(length, length, stepSrcM2);
5416       __ bge(length, stepSrcM2, ProcessM2);
5417 
5418       __ BIND(ProcessM1);
5419       __ blt(length, stepSrcM1, ProcessScalar);
5420 
5421       __ srli(size, size, 1);
5422       __ srli(stepDst, stepDst, 1);
5423       base64_vector_encode_round(src, dst, codec,
5424                     size, stepSrcM1, stepDst,
5425                     v1, v2, v3,         // inputs
5426                     v4, v5, v6, v7,     // indexes
5427                     v8, v9, v10, v11,   // outputs
5428                     Assembler::m1);
5429       __ sub(length, length, stepSrcM1);
5430 
5431       __ BIND(ProcessScalar);
5432     }
5433 
5434     // scalar version
5435     {
5436       Register byte1 = soff, byte0 = send, byte2 = doff;
5437       Register combined24Bits = isURL;
5438 
5439       __ beqz(length, Exit);
5440 
5441       Label ScalarLoop;
5442       __ BIND(ScalarLoop);
5443       {
5444         // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
5445         // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
5446 
5447         // load 3 bytes src data
5448         __ lbu(byte0, Address(src, 0));
5449         __ lbu(byte1, Address(src, 1));
5450         __ lbu(byte2, Address(src, 2));
5451         __ addi(src, src, 3);
5452 
5453         // construct 24 bits from 3 bytes
5454         __ slliw(byte0, byte0, 16);
5455         __ slliw(byte1, byte1, 8);
5456         __ orr(combined24Bits, byte0, byte1);
5457         __ orr(combined24Bits, combined24Bits, byte2);
5458 
5459         // get codec index and encode (i.e. load from codec by index)
5460         __ slliw(byte0, combined24Bits, 8);
5461         __ srliw(byte0, byte0, 26);
5462         __ add(byte0, codec, byte0);
5463         __ lbu(byte0, Address(byte0, 0));
5464 
5465         __ slliw(byte1, combined24Bits, 14);
5466         __ srliw(byte1, byte1, 26);
5467         __ add(byte1, codec, byte1);
5468         __ lbu(byte1, Address(byte1, 0));
5469 
5470         __ slliw(byte2, combined24Bits, 20);
5471         __ srliw(byte2, byte2, 26);
5472         __ add(byte2, codec, byte2);
5473         __ lbu(byte2, Address(byte2, 0));
5474 
5475         __ andi(combined24Bits, combined24Bits, 0x3f);
5476         __ add(combined24Bits, codec, combined24Bits);
5477         __ lbu(combined24Bits, Address(combined24Bits, 0));
5478 
5479         // store 4 bytes encoded data
5480         __ sb(byte0, Address(dst, 0));
5481         __ sb(byte1, Address(dst, 1));
5482         __ sb(byte2, Address(dst, 2));
5483         __ sb(combined24Bits, Address(dst, 3));
5484 
5485         __ sub(length, length, 3);
5486         __ addi(dst, dst, 4);
5487         // loop back
5488         __ bnez(length, ScalarLoop);
5489       }
5490     }
5491 
5492     __ BIND(Exit);
5493 
5494     __ leave();
5495     __ ret();
5496 
5497     return (address) start;
5498   }
5499 
5500   /**
5501    * vector registers:
5502    * input VectorRegister's:  inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
5503    * index VectorRegister's:  idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
5504    * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
5505    *
5506    * NOTE: each field will occupy a single vector register group
5507    */
5508   void base64_vector_decode_round(Register src, Register dst, Register codec,
5509                     Register size, Register stepSrc, Register stepDst, Register failedIdx,
5510                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
5511                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5512                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
5513                     Assembler::LMUL lmul) {
5514     // set vector register type/len
5515     __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
5516 
5517     // segmented load src into v registers: mem(src) => vr(4)
5518     __ vlseg4e8_v(inputV1, src);
5519 
5520     // src = src + register_group_len_bytes * 4
5521     __ add(src, src, stepSrc);
5522 
5523     // decoding
5524     //   1. indexed load: vr(4) => vr(4)
5525     __ vluxei8_v(idxV1, codec, inputV1);
5526     __ vluxei8_v(idxV2, codec, inputV2);
5527     __ vluxei8_v(idxV3, codec, inputV3);
5528     __ vluxei8_v(idxV4, codec, inputV4);
5529 
5530     //   2. check wrong data
5531     __ vor_vv(outputV1, idxV1, idxV2);
5532     __ vor_vv(outputV2, idxV3, idxV4);
5533     __ vor_vv(outputV1, outputV1, outputV2);
5534     __ vmseq_vi(v0, outputV1, -1);
5535     __ vfirst_m(failedIdx, v0);
5536     Label NoFailure, FailureAtIdx0;
5537     // vfirst_m returns -1 when no mask bit is set, so a negative value means no failure
5538     __ bltz(failedIdx, NoFailure);
5539     // when the first data (at index 0) fails, no need to process data anymore
5540     __ beqz(failedIdx, FailureAtIdx0);
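         // An invalid symbol was found at failedIdx > 0: shrink vl to failedIdx so only the
         // leading good element groups are decoded and stored, and advance dst by
         // 3 * failedIdx bytes (each 4-symbol group decodes to 3 bytes).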
5541     __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
5542     __ slli(stepDst, failedIdx, 1);
5543     __ add(stepDst, failedIdx, stepDst);
5544     __ BIND(NoFailure);
5545 
5546     //   3. compute the decoded data: vr(4) => vr(3)
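         //      For each group of four 6-bit values {s0, s1, s2, s3}:
         //        out1 = s0[5:0]:s1[5:4], out2 = s1[3:0]:s2[5:2], out3 = s2[1:0]:s3[5:0]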
5547     __ vsll_vi(idxV1, idxV1, 2);
5548     __ vsrl_vi(outputV1, idxV2, 4);
5549     __ vor_vv(outputV1, outputV1, idxV1);
5550 
5551     __ vsll_vi(idxV2, idxV2, 4);
5552     __ vsrl_vi(outputV2, idxV3, 2);
5553     __ vor_vv(outputV2, outputV2, idxV2);
5554 
5555     __ vsll_vi(idxV3, idxV3, 6);
5556     __ vor_vv(outputV3, idxV4, idxV3);
5557 
5558     // segmented store encoded data in v registers back to dst: vr(3) => mem(dst)
5559     __ vsseg3e8_v(outputV1, dst);
5560 
5561     // dst = dst + register_group_len_bytes * 3
5562     __ add(dst, dst, stepDst);
5563     __ BIND(FailureAtIdx0);
5564   }
5565 
5566   /**
5567    * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
5568    *
5569    *  Input arguments:
5570    *  c_rarg0   - src, source array
5571    *  c_rarg1   - sp, src start offset
5572    *  c_rarg2   - sl, src end offset
5573    *  c_rarg3   - dst, dest array
5574    *  c_rarg4   - dp, dst start offset
5575    *  c_rarg5   - isURL, Base64 or URL character set
5576    *  c_rarg6   - isMIME, Decoding MIME block
5577    */
5578   address generate_base64_decodeBlock() {
5579 
5580     static const uint8_t fromBase64[256] = {
5581         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5582         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5583         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5584         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5585         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5586         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
5587         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5588         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5589         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5590         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5591         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5592         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5593         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5594         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5595         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5596         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5597     };
5598 
5599     static const uint8_t fromBase64URL[256] = {
5600         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5601         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5602         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5603         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5604         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5605         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
5606         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5607         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5608         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5609         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5610         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5611         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5612         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5613         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5614         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5615         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5616     };
5617 
5618     __ align(CodeEntryAlignment);
5619     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
5620     address start = __ pc();
5621     __ enter();
5622 
5623     Register src    = c_rarg0;
5624     Register soff   = c_rarg1;
5625     Register send   = c_rarg2;
5626     Register dst    = c_rarg3;
5627     Register doff   = c_rarg4;
5628     Register isURL  = c_rarg5;
5629     Register isMIME = c_rarg6;
5630 
5631     Register codec     = c_rarg7;
5632     Register dstBackup = t6;
5633     Register length    = t3;     // total length of src data in bytes
5634 
5635     Label ProcessData, Exit;
5636     Label ProcessScalar, ScalarLoop;
5637 
5638     // The passed-in length (send - soff) is guaranteed to be > 4.
5639     // This intrinsic only processes data whose length is a multiple of 4;
5640     // the Java level does not guarantee that, so round the length down explicitly.
5641     __ sub(length, send, soff);
5642     __ andi(length, length, -4);
5643     // real src/dst to process data
5644     __ add(src, src, soff);
5645     __ add(dst, dst, doff);
5646     // backup of dst, used to calculate the return value at exit
5647     __ mv(dstBackup, dst);
5648 
5649     // load the codec base address
5650     __ la(codec, ExternalAddress((address) fromBase64));
5651     __ beqz(isURL, ProcessData);
5652     __ la(codec, ExternalAddress((address) fromBase64URL));
5653     __ BIND(ProcessData);
5654 
5655     // vector version
5656     if (UseRVV) {
5657       // In the MIME case there is a default line-length limit of 76, which can be
5658       // different from (i.e. smaller than) (send - soff), so for MIME we go through
5659       // the scalar code path directly.
5660       __ bnez(isMIME, ScalarLoop);
5661 
5662       Label ProcessM1, ProcessM2;
5663 
5664       Register failedIdx = soff;
5665       Register stepSrcM1 = send;
5666       Register stepSrcM2 = doff;
5667       Register stepDst   = isURL;
5668       Register size      = t4;
5669 
5670       __ mv(size, MaxVectorSize * 2);
5671       __ mv(stepSrcM1, MaxVectorSize * 4);
5672       __ slli(stepSrcM2, stepSrcM1, 1);
5673       __ mv(stepDst, MaxVectorSize * 2 * 3);
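      // Sizing rationale (reference note): an m2 decode round below consumes four register
      // groups of (MaxVectorSize * 2) bytes each, i.e. stepSrcM2 = MaxVectorSize * 8 source
      // bytes, and every 4 encoded bytes yield 3 decoded bytes, hence stepDst = MaxVectorSize * 6.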
5674 
5675       __ blt(length, stepSrcM2, ProcessM1);
5676 
5677 
5678       // Assembler::m2
5679       __ BIND(ProcessM2);
5680       base64_vector_decode_round(src, dst, codec,
5681                     size, stepSrcM2, stepDst, failedIdx,
5682                     v2, v4, v6, v8,      // inputs
5683                     v10, v12, v14, v16,  // indexes
5684                     v18, v20, v22,       // outputs
5685                     Assembler::m2);
5686       __ sub(length, length, stepSrcM2);
5687 
5688       // error check
5689       // the only valid negative value of failedIdx is -1; a value >= 0 flags an invalid input byte
5690       __ bgez(failedIdx, Exit);
5691 
5692       __ bge(length, stepSrcM2, ProcessM2);
5693 
5694 
5695       // Assembler::m1
5696       __ BIND(ProcessM1);
5697       __ blt(length, stepSrcM1, ProcessScalar);
5698 
5699       __ srli(size, size, 1);
5700       __ srli(stepDst, stepDst, 1);
5701       base64_vector_decode_round(src, dst, codec,
5702                     size, stepSrcM1, stepDst, failedIdx,
5703                     v1, v2, v3, v4,      // inputs
5704                     v5, v6, v7, v8,      // indexes
5705                     v9, v10, v11,        // outputs
5706                     Assembler::m1);
5707       __ sub(length, length, stepSrcM1);
5708 
5709       // error check
5710       // the only valid negative value of failedIdx is -1; a value >= 0 flags an invalid input byte
5711       __ bgez(failedIdx, Exit);
5712 
5713       __ BIND(ProcessScalar);
5714       __ beqz(length, Exit);
5715     }
5716 
5717     // scalar version
5718     {
5719       Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
5720       Register combined32Bits = t4;
5721 
5722       // encoded:   [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
5723       // plain:     [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
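      // Reference-only C sketch of one scalar iteration below (illustrative, not the emitted
      // code; `codec` is the 256-entry table above, whose 255u entries for invalid characters
      // read back as -1 through the signed lb loads):
      //
      //   int b0 = codec[src[0]], b1 = codec[src[1]], b2 = codec[src[2]], b3 = codec[src[3]];
      //   int32_t combined = (int32_t)(((uint32_t)b0 << 18) | ((uint32_t)b1 << 12) |
      //                                ((uint32_t)b2 <<  6) |  (uint32_t)b3);
      //   if (combined < 0) break;                 // any -1 entry makes the value negative
      //   dst[0] = (uint8_t)(combined >> 16);
      //   dst[1] = (uint8_t)(combined >>  8);
      //   dst[2] = (uint8_t) combined;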
5724       __ BIND(ScalarLoop);
5725 
5726       // load 4 bytes encoded src data
5727       __ lbu(byte0, Address(src, 0));
5728       __ lbu(byte1, Address(src, 1));
5729       __ lbu(byte2, Address(src, 2));
5730       __ lbu(byte3, Address(src, 3));
5731       __ addi(src, src, 4);
5732 
5733       // get codec index and decode (i.e. load from codec by index)
5734       __ add(byte0, codec, byte0);
5735       __ add(byte1, codec, byte1);
5736       __ lb(byte0, Address(byte0, 0));
5737       __ lb(byte1, Address(byte1, 0));
5738       __ add(byte2, codec, byte2);
5739       __ add(byte3, codec, byte3);
5740       __ lb(byte2, Address(byte2, 0));
5741       __ lb(byte3, Address(byte3, 0));
5742       __ slliw(byte0, byte0, 18);
5743       __ slliw(byte1, byte1, 12);
5744       __ orr(byte0, byte0, byte1);
5745       __ orr(byte0, byte0, byte3);
5746       __ slliw(byte2, byte2, 6);
5747       // For performance, `combined32Bits` is constructed to serve two purposes at once:
5748       //  1. the error check below (an invalid byte decodes to -1, which makes the value negative)
5749       //  2. the decode below
5750       __ orr(combined32Bits, byte0, byte2);
5751 
5752       // error check
5753       __ bltz(combined32Bits, Exit);
5754 
5755       // store 3 bytes decoded data
5756       __ sraiw(byte0, combined32Bits, 16);
5757       __ sraiw(byte1, combined32Bits, 8);
5758       __ sb(byte0, Address(dst, 0));
5759       __ sb(byte1, Address(dst, 1));
5760       __ sb(combined32Bits, Address(dst, 2));
5761 
5762       __ sub(length, length, 4);
5763       __ addi(dst, dst, 3);
5764       // loop back
5765       __ bnez(length, ScalarLoop);
5766     }
5767 
5768     __ BIND(Exit);
5769     __ sub(c_rarg0, dst, dstBackup);
5770 
5771     __ leave();
5772     __ ret();
5773 
5774     return (address) start;
5775   }
5776 
5777   void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
5778     VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
5779     Register temp0, Register temp1, Register temp2,  Register temp3,
5780     VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
5781 
5782     assert((lmul == Assembler::m4 && step == 64) ||
5783            (lmul == Assembler::m2 && step == 32) ||
5784            (lmul == Assembler::m1 && step == 16),
5785            "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
5786     // Below is the function for calculating the Adler32 checksum with a 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
5787     // The results are in v12, v13, ..., v22, v23. The example below is for the 64-byte step case.
5788     // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
5789     // In non-vectorized code, we update s1 and s2 as:
5790     //   s1 <- s1 + b1
5791     //   s2 <- s2 + s1
5792     //   s1 <- s1 + b2
5793     //   s2 <- s2 + s1
5794     //   ...
5795     //   s1 <- s1 + b64
5796     //   s2 <- s2 + s1
5797     // Putting above assignments together, we have:
5798     //   s1_new = s1 + b1 + b2 + ... + b64
5799     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
5800     //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
5801     //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
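    // Reference-only scalar sketch of one step-byte chunk (illustrative, not emitted):
    //
    //   uint32_t sum = 0, dot = 0;
    //   for (int i = 0; i < step; i++) { sum += b[i]; dot += b[i] * (step - i); }
    //   s2 += s1 * step + dot;   // s2_new
    //   s1 += sum;               // s1_new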
5802 
5803     __ mv(temp3, step);
5804     // Load data
5805     __ vsetvli(temp0, temp3, Assembler::e8, lmul);
5806     __ vle8_v(vbytes, buff);
5807     __ addi(buff, buff, step);
5808 
5809     // Upper bound reduction sum for s1_new:
5810     // 0xFF * 64 = 0x3FC0, so:
5811     // 1. Need to do vector-widening reduction sum
5812     // 2. It is safe to perform sign-extension during vmv.x.s with 16-bit elements
5813     __ vwredsumu_vs(vs1acc, vbytes, vzero);
5814     // Multiplication for s2_new
5815     __ vwmulu_vv(vs2acc, vtable, vbytes);
5816 
5817     // s2 = s2 + s1 * step (shift left by log2(step))
5818     __ slli(temp1, s1, exact_log2(step));
5819     __ add(s2, s2, temp1);
5820 
5821     // Summing up calculated results for s2_new
5822     if (MaxVectorSize > 16) {
5823       __ vsetvli(temp0, temp3, Assembler::e16, lmul);
5824     } else {
5825       // Half of the vector-widening multiplication result is in the successor of the
5826       // vs2acc group for vlen == 16, in which case we need to double the vector register
5827       // group width in order to reduction-sum all of them
5828       Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
5829                                (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
5830       __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
5831     }
5832     // Upper bound for reduction sum:
5833     // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
5834     // 1. Need to do vector-widening reduction sum
5835     // 2. It is safe to perform sign-extension during vmv.x.s with 32-bit elements
5836     __ vwredsumu_vs(vtemp1, vs2acc, vzero);
5837 
5838     // Extracting results for:
5839     // s1_new
5840     __ vmv_x_s(temp0, vs1acc);
5841     __ add(s1, s1, temp0);
5842     // s2_new
5843     __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
5844     __ vmv_x_s(temp1, vtemp1);
5845     __ add(s2, s2, temp1);
5846   }
5847 
5848   /***
5849    *  int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
5850    *
5851    *  Arguments:
5852    *
5853    *  Inputs:
5854    *   c_rarg0   - int   adler
5855    *   c_rarg1   - byte* buff (b + off)
5856    *   c_rarg2   - int   len
5857    *
5858    *  Output:
5859    *   c_rarg0   - int adler result
5860    */
5861   address generate_updateBytesAdler32() {
5862     __ align(CodeEntryAlignment);
5863     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
5864     address start = __ pc();
5865 
5866     Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
5867       L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
5868 
5869     // Aliases
5870     Register adler  = c_rarg0;
5871     Register s1     = c_rarg0;
5872     Register s2     = c_rarg3;
5873     Register buff   = c_rarg1;
5874     Register len    = c_rarg2;
5875     Register nmax  = c_rarg4;
5876     Register base  = c_rarg5;
5877     Register count = c_rarg6;
5878     Register temp0 = t3;
5879     Register temp1 = t4;
5880     Register temp2 = t5;
5881     Register temp3 = t6;
5882 
5883     VectorRegister vzero = v31;
5884     VectorRegister vbytes = v8; // group: v8, v9, v10, v11
5885     VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
5886     VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
5887     VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
5888     VectorRegister vtable_32 = v4; // group: v4, v5
5889     VectorRegister vtable_16 = v30;
5890     VectorRegister vtemp1 = v28;
5891     VectorRegister vtemp2 = v29;
5892 
5893     // Max number of bytes we can process before having to take the mod
5894     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
5895     const uint64_t BASE = 0xfff1;
5896     const uint64_t NMAX = 0x15B0;
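    // Worked check of the NMAX bound (reference only): with n = 5552,
    //   255 * n * (n + 1) / 2 + (n + 1) * (BASE - 1)
    //     = 3,930,857,640 + 363,832,560 = 4,294,690,200 <= 2^32 - 1,
    // whereas n = 5553 gives 4,296,171,735 and would overflow 32 bits.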
5897 
5898     // Loop steps
5899     int step_64 = 64;
5900     int step_32 = 32;
5901     int step_16 = 16;
5902     int step_1  = 1;
5903 
5904     __ enter(); // Required for proper stackwalking of RuntimeStub frame
5905     __ mv(temp1, 64);
5906     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
5907 
5908     // Generating accumulation coefficients for further calculations
5909     // vtable_64:
5910     __ vid_v(vtemp1);
5911     __ vrsub_vx(vtable_64, vtemp1, temp1);
5912     // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
5913 
5914     // vtable_32:
5915     __ mv(temp1, 32);
5916     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
5917     __ vid_v(vtemp1);
5918     __ vrsub_vx(vtable_32, vtemp1, temp1);
5919     // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
5920 
5921     __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
5922     // vtable_16:
5923     __ mv(temp1, 16);
5924     __ vid_v(vtemp1);
5925     __ vrsub_vx(vtable_16, vtemp1, temp1);
5926     // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
5927 
5928     __ vmv_v_i(vzero, 0);
5929 
5930     __ mv(base, BASE);
5931     __ mv(nmax, NMAX);
5932 
5933     // s1 is initialized to the lower 16 bits of adler
5934     // s2 is initialized to the upper 16 bits of adler
5935     __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
5936     __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff)
5937 
5938     // The pipelined loop needs at least 16 elements for one iteration
5939     // It does check this, but it is more efficient to skip to the cleanup loop here
5940     __ mv(temp0, step_16);
5941     __ bgeu(len, temp0, L_nmax);
5942     __ beqz(len, L_combine);
5943 
5944     // Jumping to L_by1_loop
5945     __ sub(len, len, step_1);
5946     __ j(L_by1_loop);
5947 
5948   __ bind(L_nmax);
5949     __ sub(len, len, nmax);
5950     __ sub(count, nmax, 16);
5951     __ bltz(len, L_by16);
5952 
5953   // Align L_nmax loop by 64
5954   __ bind(L_nmax_loop_entry);
5955     __ sub(count, count, 32);
5956 
5957   __ bind(L_nmax_loop);
5958     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
5959       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5960       vtemp1, vtemp2, step_64, Assembler::m4);
5961     __ sub(count, count, step_64);
5962     __ bgtz(count, L_nmax_loop);
5963 
5964     // Three 16-byte iterations' worth (48 bytes) of the nmax block remain; do one 32-byte and one 16-byte round
5965     adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
5966       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5967       vtemp1, vtemp2, step_32, Assembler::m2);
5968     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
5969       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5970       vtemp1, vtemp2, step_16, Assembler::m1);
5971 
5972     // s1 = s1 % BASE
5973     __ remuw(s1, s1, base);
5974     // s2 = s2 % BASE
5975     __ remuw(s2, s2, base);
5976 
5977     __ sub(len, len, nmax);
5978     __ sub(count, nmax, 16);
5979     __ bgez(len, L_nmax_loop_entry);
5980 
5981   __ bind(L_by16);
5982     __ add(len, len, count);
5983     __ bltz(len, L_by1);
5984     // Trying to unroll
5985     __ mv(temp3, step_64);
5986     __ blt(len, temp3, L_by16_loop);
5987 
5988   __ bind(L_by16_loop_unroll);
5989     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
5990       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5991       vtemp1, vtemp2, step_64, Assembler::m4);
5992     __ sub(len, len, step_64);
5993     // By now the temp3 should still be 64
5994     __ bge(len, temp3, L_by16_loop_unroll);
5995 
5996   __ bind(L_by16_loop);
5997     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
5998       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5999       vtemp1, vtemp2, step_16, Assembler::m1);
6000     __ sub(len, len, step_16);
6001     __ bgez(len, L_by16_loop);
6002 
6003   __ bind(L_by1);
6004     __ add(len, len, 15);
6005     __ bltz(len, L_do_mod);
6006 
6007   __ bind(L_by1_loop);
6008     __ lbu(temp0, Address(buff, 0));
6009     __ addi(buff, buff, step_1);
6010     __ add(s1, temp0, s1);
6011     __ add(s2, s2, s1);
6012     __ sub(len, len, step_1);
6013     __ bgez(len, L_by1_loop);
6014 
6015   __ bind(L_do_mod);
6016     // s1 = s1 % BASE
6017     __ remuw(s1, s1, base);
6018     // s2 = s2 % BASE
6019     __ remuw(s2, s2, base);
6020 
6021     // Combine lower bits and higher bits
6022     // adler = s1 | (s2 << 16)
6023   __ bind(L_combine);
6024     __ slli(s2, s2, 16);
6025     __ orr(s1, s1, s2);
6026 
6027     __ leave(); // Required for proper stackwalking of RuntimeStub frame
6028     __ ret();
6029 
6030     return start;
6031   }
6032 
6033 #endif // COMPILER2_OR_JVMCI
6034 
6035 #ifdef COMPILER2
6036 
6037 static const int64_t right_2_bits = right_n_bits(2);
6038 static const int64_t right_3_bits = right_n_bits(3);
6039 
6040   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
6041   // are represented as long[5], with BITS_PER_LIMB = 26.
6042   // Pack five 26-bit limbs into three 64-bit registers.
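  // Reference-only sketch of the packing (assumes each limbs[i] already fits in 26 bits):
  //
  //   unsigned __int128 v = 0;
  //   for (int i = 4; i >= 0; i--) {
  //     v = (v << 26) | (unsigned __int128)(limbs[i] & ((1u << 26) - 1));
  //   }
  //   dest0 = (uint64_t) v;         // bits   0..63
  //   dest1 = (uint64_t)(v >> 64);  // bits  64..127
  //   dest2 = (uint64_t)(v >> 128); // bits 128..129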
6043   void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
6044     assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);
6045 
6046     // The goal is to have 128-bit value in dest2:dest1:dest0
6047     __ ld(dest0, Address(src, 0));    // 26 bits in dest0
6048 
6049     __ ld(tmp1, Address(src, sizeof(jlong)));
6050     __ slli(tmp1, tmp1, 26);
6051     __ add(dest0, dest0, tmp1);       // 52 bits in dest0
6052 
6053     __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
6054     __ slli(tmp1, tmp2, 52);
6055     __ add(dest0, dest0, tmp1);       // dest0 is full
6056 
6057     __ srli(dest1, tmp2, 12);         // 14 bits in dest1
6058 
6059     __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
6060     __ slli(tmp1, tmp1, 14);
6061     __ add(dest1, dest1, tmp1);       // 40 bits in dest1
6062 
6063     __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
6064     __ slli(tmp2, tmp1, 40);
6065     __ add(dest1, dest1, tmp2);       // dest1 is full
6066 
6067     if (dest2->is_valid()) {
6068       __ srli(tmp1, tmp1, 24);
6069       __ mv(dest2, tmp1);               // 2 bits in dest2
6070     } else {
6071 #ifdef ASSERT
6072       Label OK;
6073       __ srli(tmp1, tmp1, 24);
6074       __ beq(zr, tmp1, OK);           // 2 bits
6075       __ stop("high bits of Poly1305 integer should be zero");
6076       __ should_not_reach_here();
6077       __ bind(OK);
6078 #endif
6079     }
6080   }
6081 
6082   // As above, but return only a 128-bit integer, packed into two
6083   // 64-bit registers.
6084   void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
6085     poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
6086   }
6087 
6088   // U_2:U_1:U_0 += (U_2 >> 2) * 5
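  // (This works because 2^130 = 5 (mod 2^130 - 5): writing the value as k * 2^130 + r,
  //  with k = U_2 >> 2 and r the low 130 bits, it is congruent to r + 5 * k, which the
  //  code below forms as (r + k) + 4 * k.)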
6089   void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
6090     assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);
6091 
6092     // First, U_2:U_1:U_0 += (U_2 >> 2)
6093     __ srli(tmp1, U_2, 2);
6094     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
6095     __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
6096     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
6097     __ add(U_2, U_2, tmp2);
6098 
6099     // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
6100     __ slli(tmp1, tmp1, 2);
6101     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
6102     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
6103     __ add(U_2, U_2, tmp2);
6104   }
6105 
6106   // Poly1305, RFC 7539
6107   // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
6108 
6109   // Arguments:
6110   //    c_rarg0:   input_start -- where the input is stored
6111   //    c_rarg1:   length
6112   //    c_rarg2:   acc_start -- where the output will be stored
6113   //    c_rarg3:   r_start -- where the randomly generated 128-bit key is stored
6114 
6115   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
6116   // description of the tricks used to simplify and accelerate this
6117   // computation.
6118 
6119   address generate_poly1305_processBlocks() {
6120     __ align(CodeEntryAlignment);
6121     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
6122     address start = __ pc();
6123     __ enter();
6124     Label here;
6125 
6126     RegSet saved_regs = RegSet::range(x18, x21);
6127     RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
6128     __ push_reg(saved_regs, sp);
6129 
6130     // Arguments
6131     const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
6132 
6133     // R_n is the 128-bit randomly-generated key, packed into two
6134     // registers. The caller passes this key to us as long[5], with
6135     // BITS_PER_LIMB = 26.
6136     const Register R_0 = *regs, R_1 = *++regs;
6137     poly1305_pack_26(R_0, R_1, r_start, t1, t2);
6138 
6139     // RR_n is (R_n >> 2) * 5
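    // (The shadd calls below form t1 + (t1 << 2), i.e. 5 * (R_n >> 2).)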
6140     const Register RR_0 = *++regs, RR_1 = *++regs;
6141     __ srli(t1, R_0, 2);
6142     __ shadd(RR_0, t1, t1, t2, 2);
6143     __ srli(t1, R_1, 2);
6144     __ shadd(RR_1, t1, t1, t2, 2);
6145 
6146     // U_n is the current checksum
6147     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
6148     poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
6149 
6150     static constexpr int BLOCK_LENGTH = 16;
6151     Label DONE, LOOP;
6152 
6153     __ mv(t1, BLOCK_LENGTH);
6154     __ blt(length, t1, DONE); {
6155       __ bind(LOOP);
6156 
6157       // S_n is to be the sum of U_n and the next block of data
6158       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
6159       __ ld(S_0, Address(input_start, 0));
6160       __ ld(S_1, Address(input_start, wordSize));
6161 
6162       __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
6163       __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
6164       __ add(S_2, U_2, t1);
6165 
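      // Set the 2^128 padding bit of the full 16-byte block (each block gets an extra
      // "1" appended in Poly1305); S_2 holds the bits at and above 2^128.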
6166       __ addi(S_2, S_2, 1);
6167 
6168       const Register U_0HI = *++regs, U_1HI = *++regs;
6169 
6170       // NB: this logic depends on some of the special properties of
6171       // Poly1305 keys. In particular, because we know that the top
6172       // four bits of R_0 and R_1 are zero, we can add together
6173       // partial products without any risk of needing to propagate a
6174       // carry out.
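      // Layout of the accumulation below (reference only):
      //   U_0:U_0HI = S_0 * R_0 + S_1 * RR_1 + S_2 * RR_0
      //   U_1:U_1HI = S_0 * R_1 + S_1 * R_0  + S_2 * RR_1
      //   U_2       = S_2 * (R_0 & 3)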
6175       __ wide_mul(U_0, U_0HI, S_0, R_0);
6176       __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
6177       __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
6178 
6179       __ wide_mul(U_1, U_1HI, S_0, R_1);
6180       __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
6181       __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
6182 
6183       __ andi(U_2, R_0, right_2_bits);
6184       __ mul(U_2, S_2, U_2);
6185 
6186       // Partial reduction mod 2**130 - 5
6187       __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
6188       __ adc(U_2, U_2, U_1HI, t1);
6189       // Sum is now in U_2:U_1:U_0.
6190 
6191       // U_2:U_1:U_0 += (U_2 >> 2) * 5
6192       poly1305_reduce(U_2, U_1, U_0, t1, t2);
6193 
6194       __ sub(length, length, BLOCK_LENGTH);
6195       __ addi(input_start, input_start, BLOCK_LENGTH);
6196       __ mv(t1, BLOCK_LENGTH);
6197       __ bge(length, t1, LOOP);
6198     }
6199 
6200     // Further reduce modulo 2^130 - 5
6201     poly1305_reduce(U_2, U_1, U_0, t1, t2);
6202 
6203     // Unpack the sum into five 26-bit limbs and write to memory.
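    // Reference-only sketch of the unpacking (v being the 130-bit value U_2:U_1:U_0):
    //   for (int i = 0; i < 5; i++) { acc[i] = (uint64_t)(v & 0x3ffffff); v >>= 26; }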
6204     // The first 26 bits are the first limb
6205     __ slli(t1, U_0, 38); // Take lowest 26 bits
6206     __ srli(t1, t1, 38);
6207     __ sd(t1, Address(acc_start)); // First 26-bit limb
6208 
6209     // Bits 27-52 of U_0 are the second limb
6210     __ slli(t1, U_0, 12); // Take next 27-52 bits
6211     __ srli(t1, t1, 38);
6212     __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
6213 
6214     // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
6215     __ srli(t1, U_0, 52);
6216     __ slli(t2, U_1, 50);
6217     __ srli(t2, t2, 38);
6218     __ add(t1, t1, t2);
6219     __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
6220 
6221     // Storing 15-40 bits of U_1
6222     __ slli(t1, U_1, 24); // Already used up 14 bits
6223     __ srli(t1, t1, 38); // Clear all other bits from t1
6224     __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
6225 
6226     // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
6227     __ srli(t1, U_1, 40);
6228     __ andi(t2, U_2, right_3_bits);
6229     __ slli(t2, t2, 24);
6230     __ add(t1, t1, t2);
6231     __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
6232 
6233     __ bind(DONE);
6234     __ pop_reg(saved_regs, sp);
6235     __ leave(); // Required for proper stackwalking
6236     __ ret();
6237 
6238     return start;
6239   }
6240 
6241   void generate_vector_math_stubs() {
6242     if (!UseRVV) {
6243       log_info(library)("vector is not supported, skip loading vector math (sleef) library!");
6244       return;
6245     }
6246 
6247     // Get native vector math stub routine addresses
6248     void* libsleef = nullptr;
6249     char ebuf[1024];
6250     char dll_name[JVM_MAXPATHLEN];
6251     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
6252       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
6253     }
6254     if (libsleef == nullptr) {
6255       log_info(library)("Failed to load native vector math (sleef) library, %s!", ebuf);
6256       return;
6257     }
6258 
6259     // Method naming convention
6260     //   All the methods are named as <OP><T>_<U><suffix>
6261     //
6262     //   Where:
6263     //     <OP>     is the operation name, e.g. sin, cos
6264     //     <T>      is to indicate float/double
6265     //              "fx/dx" for vector float/double operation
6266     //     <U>      is the precision level
6267     //              "u10/u05" represents 1.0/0.5 ULP error bounds
6268     //               We use "u10" for all operations by default
6269     //               But for those functions that do not have u10 support, we use "u05" instead
6270     //     <suffix> rvv, indicates riscv vector extension
6271     //
6272     //   e.g. sinfx_u10rvv is the method for computing vector float sin using rvv instructions
6273     //
6274     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
6275 
6276     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
6277       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
6278       if (vop == VectorSupport::VECTOR_OP_TANH) { // skip tanh because of performance regression
6279         continue;
6280       }
6281 
6282       // The native library does not support u10 level of "hypot".
6283       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
6284 
6285       snprintf(ebuf, sizeof(ebuf), "%sfx_%srvv", VectorSupport::mathname[op], ulf);
6286       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
6287 
6288       snprintf(ebuf, sizeof(ebuf), "%sdx_%srvv", VectorSupport::mathname[op], ulf);
6289       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
6290     }
6291   }
6292 
6293 #endif // COMPILER2
6294 
6295   /**
6296    *  Arguments:
6297    *
6298    * Inputs:
6299    *   c_rarg0   - int crc
6300    *   c_rarg1   - byte* buf
6301    *   c_rarg2   - int length
6302    *
6303    * Output:
6304    *   c_rarg0   - int crc result
6305    */
6306   address generate_updateBytesCRC32() {
6307     assert(UseCRC32Intrinsics, "what are we doing here?");
6308 
6309     __ align(CodeEntryAlignment);
6310     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6311 
6312     address start = __ pc();
6313 
6314     // input parameters
6315     const Register crc    = c_rarg0;  // crc
6316     const Register buf    = c_rarg1;  // source java byte array address
6317     const Register len    = c_rarg2;  // length
6318 
6319     BLOCK_COMMENT("Entry:");
6320     __ enter(); // required for proper stackwalking of RuntimeStub frame
6321 
6322     __ kernel_crc32(crc, buf, len,
6323                     c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables
6324                     c_rarg7, t2, t3, t4, t5, t6);       // misc tmps
6325 
6326     __ leave(); // required for proper stackwalking of RuntimeStub frame
6327     __ ret();
6328 
6329     return start;
6330   }
6331 
6332   // exception handler for upcall stubs
6333   address generate_upcall_stub_exception_handler() {
6334     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
6335     address start = __ pc();
6336 
6337     // Native caller has no idea how to handle exceptions,
6338     // so we just crash here. Up to callee to catch exceptions.
6339     __ verify_oop(x10); // the exception oop is returned in x10 (a0)
6340     __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
6341     __ should_not_reach_here();
6342 
6343     return start;
6344   }
6345 
6346   // load Method* target of MethodHandle
6347   // j_rarg0 = jobject receiver
6348   // xmethod = Method* result
6349   address generate_upcall_stub_load_target() {
6350 
6351     StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target");
6352     address start = __ pc();
6353 
6354     __ resolve_global_jobject(j_rarg0, t0, t1);
6355     // Load target method from receiver
6356     __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
6357     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
6358     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
6359     __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
6360                       Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
6361                       noreg, noreg);
6362     __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
6363 
6364     __ ret();
6365 
6366     return start;
6367   }
6368 
6369 #undef __
6370 
6371   // Initialization
6372   void generate_initial_stubs() {
6373     // Generates initial stubs and initializes the entry points
6374 
6375     // Entry points that exist in all platforms. Note: This is code
6376     // that could be shared among different platforms - however the
6377     // benefit seems to be smaller than the disadvantage of having a
6378     // much more complicated generator structure. See also comment in
6379     // stubRoutines.hpp.
6380 
6381     StubRoutines::_forward_exception_entry = generate_forward_exception();
6382 
6383     if (UnsafeMemoryAccess::_table == nullptr) {
6384       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
6385     }
6386 
6387     StubRoutines::_call_stub_entry =
6388       generate_call_stub(StubRoutines::_call_stub_return_address);
6389 
6390     // is referenced by megamorphic call
6391     StubRoutines::_catch_exception_entry = generate_catch_exception();
6392 
6393     if (UseCRC32Intrinsics) {
6394       // set table address before stub generation which uses it
6395       StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table;
6396       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
6397     }
6398   }
6399 
6400   void generate_continuation_stubs() {
6401     // Continuation stubs:
6402     StubRoutines::_cont_thaw             = generate_cont_thaw();
6403     StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
6404     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
6405   }
6406 
6407   void generate_final_stubs() {
6408     // support for verify_oop (must happen after universe_init)
6409     if (VerifyOops) {
6410       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
6411     }
6412 
6413     // arraycopy stubs used by compilers
6414     generate_arraycopy_stubs();
6415 
6416     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
6417     if (bs_nm != nullptr) {
6418       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
6419     }
6420 
6421 #ifdef COMPILER2
6422     if (UseSecondarySupersTable) {
6423       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
6424       if (!InlineSecondarySupersTest) {
6425         for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
6426           StubRoutines::_lookup_secondary_supers_table_stubs[slot]
6427             = generate_lookup_secondary_supers_table_stub(slot);
6428         }
6429       }
6430     }
6431 #endif // COMPILER2
6432 
6433     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
6434     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
6435 
6436     StubRoutines::riscv::set_completed();
6437   }
6438 
6439   void generate_compiler_stubs() {
6440 #ifdef COMPILER2
6441     if (UseMulAddIntrinsic) {
6442       StubRoutines::_mulAdd = generate_mulAdd();
6443     }
6444 
6445     if (UseMultiplyToLenIntrinsic) {
6446       StubRoutines::_multiplyToLen = generate_multiplyToLen();
6447     }
6448 
6449     if (UseSquareToLenIntrinsic) {
6450       StubRoutines::_squareToLen = generate_squareToLen();
6451     }
6452 
6453     if (UseMontgomeryMultiplyIntrinsic) {
6454       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
6455       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
6456       StubRoutines::_montgomeryMultiply = g.generate_multiply();
6457     }
6458 
6459     if (UseMontgomerySquareIntrinsic) {
6460       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
6461       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
6462       StubRoutines::_montgomerySquare = g.generate_square();
6463     }
6464 
6465     if (UseAESIntrinsics) {
6466       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
6467       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
6468     }
6469 
6470     if (UsePoly1305Intrinsics) {
6471       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
6472     }
6473 
6474     if (UseRVVForBigIntegerShiftIntrinsics) {
6475       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
6476       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
6477     }
6478 
6479     if (UseSHA256Intrinsics) {
6480       Sha2Generator sha2(_masm, this);
6481       StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(false);
6482       StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true);
6483     }
6484 
6485     if (UseSHA512Intrinsics) {
6486       Sha2Generator sha2(_masm, this);
6487       StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(false);
6488       StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true);
6489     }
6490 
6491     if (UseMD5Intrinsics) {
6492       StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
6493       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");
6494     }
6495 
6496     if (UseChaCha20Intrinsics) {
6497       StubRoutines::_chacha20Block = generate_chacha20Block();
6498     }
6499 
6500     if (UseSHA1Intrinsics) {
6501       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false, "sha1_implCompress");
6502       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true, "sha1_implCompressMB");
6503     }
6504 
6505     if (UseBASE64Intrinsics) {
6506       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
6507       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
6508     }
6509 
6510     if (UseAdler32Intrinsics) {
6511       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6512     }
6513 
6514     generate_compare_long_strings();
6515 
6516     generate_string_indexof_stubs();
6517 
6518     generate_vector_math_stubs();
6519 
6520 #endif // COMPILER2
6521   }
6522 
6523  public:
6524   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
6525     switch(kind) {
6526     case Initial_stubs:
6527       generate_initial_stubs();
6528       break;
6529     case Continuation_stubs:
6530       generate_continuation_stubs();
6531       break;
6532     case Compiler_stubs:
6533       generate_compiler_stubs();
6534       break;
6535     case Final_stubs:
6536       generate_final_stubs();
6537       break;
6538     default:
6539       fatal("unexpected stubs kind: %d", kind);
6540       break;
6541     };
6542   }
6543 }; // end class declaration
6544 
6545 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
6546   StubGenerator g(code, kind);
6547 }
--- EOF ---